fix: Revert changes to session management in AsyncHttpWebcrawler and solve the underlying issue by removing the session closure in the finally block of the session context.

This commit is contained in:
Aravind Karnam
2025-04-08 18:31:00 +05:30
parent 7155778eac
commit 6f7ab9c927

View File

@@ -1706,6 +1706,15 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Async context-manager exit: release crawler resources via close()."""
    await self.close()
@contextlib.asynccontextmanager
async def _session_context(self):
    """Yield the shared aiohttp ClientSession, (re)starting it lazily.

    Restores the pre-existing behavior of also restarting when the
    session exists but has been closed (the inline code this replaced
    checked ``self._session.closed`` as well); checking only for
    ``None`` would yield an unusable closed session.

    Intentionally does NOT close the session on exit: the session is
    shared across requests and is expected to be torn down by
    ``close()`` (see ``__aexit__``), so the former ``try/finally: pass``
    scaffolding was dead code and has been removed.
    """
    if not self._session or self._session.closed:
        await self.start()
    yield self._session
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
    """Register ``hook_func`` for a known hook point.

    The stored callable is ``_execute_hook`` pre-bound with the hook
    type and the user function, so invoking the hook later dispatches
    through the strategy's hook-execution machinery.

    NOTE(review): as written here, an unrecognized ``hook_type`` is
    silently ignored rather than raising.
    """
    if hook_type not in self.hooks:
        return
    self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
@@ -1782,77 +1791,75 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
url: str, url: str,
config: CrawlerRunConfig config: CrawlerRunConfig
) -> AsyncCrawlResponse: ) -> AsyncCrawlResponse:
if not self._session or self._session.closed: async with self._session_context() as session:
await self.start() timeout = ClientTimeout(
total=config.page_timeout or self.DEFAULT_TIMEOUT,
timeout = ClientTimeout( connect=10,
total=config.page_timeout or self.DEFAULT_TIMEOUT, sock_read=30
connect=10, )
sock_read=30
) headers = dict(self._BASE_HEADERS)
if self.browser_config.headers:
headers = dict(self._BASE_HEADERS) headers.update(self.browser_config.headers)
if self.browser_config.headers:
headers.update(self.browser_config.headers)
request_kwargs = { request_kwargs = {
'timeout': timeout, 'timeout': timeout,
'allow_redirects': self.browser_config.follow_redirects, 'allow_redirects': self.browser_config.follow_redirects,
'ssl': self.browser_config.verify_ssl, 'ssl': self.browser_config.verify_ssl,
'headers': headers 'headers': headers
} }
if self.browser_config.method == "POST": if self.browser_config.method == "POST":
if self.browser_config.data: if self.browser_config.data:
request_kwargs['data'] = self.browser_config.data request_kwargs['data'] = self.browser_config.data
if self.browser_config.json: if self.browser_config.json:
request_kwargs['json'] = self.browser_config.json request_kwargs['json'] = self.browser_config.json
await self.hooks['before_request'](url, request_kwargs) await self.hooks['before_request'](url, request_kwargs)
try: try:
async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: async with session.request(self.browser_config.method, url, **request_kwargs) as response:
content = memoryview(await response.read()) content = memoryview(await response.read())
if not (200 <= response.status < 300): if not (200 <= response.status < 300):
raise HTTPStatusError( raise HTTPStatusError(
response.status, response.status,
f"Unexpected status code for {url}" f"Unexpected status code for {url}"
)
encoding = response.charset
if not encoding:
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
result = AsyncCrawlResponse(
html=content.tobytes().decode(encoding, errors='replace'),
response_headers=dict(response.headers),
status_code=response.status,
redirected_url=str(response.url)
) )
encoding = response.charset await self.hooks['after_request'](result)
if not encoding: return result
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
result = AsyncCrawlResponse(
html=content.tobytes().decode(encoding, errors='replace'),
response_headers=dict(response.headers),
status_code=response.status,
redirected_url=str(response.url)
)
await self.hooks['after_request'](result)
return result
except aiohttp.ServerTimeoutError as e: except aiohttp.ServerTimeoutError as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise ConnectionTimeoutError(f"Request timed out: {str(e)}") raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
except aiohttp.ClientConnectorError as e:
await self.hooks['on_error'](e)
raise ConnectionError(f"Connection failed: {str(e)}")
except aiohttp.ClientError as e:
await self.hooks['on_error'](e)
raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
except aiohttp.ClientConnectorError as e: except asyncio.exceptions.TimeoutError as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise ConnectionError(f"Connection failed: {str(e)}") raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
except aiohttp.ClientError as e: except Exception as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise HTTPCrawlerError(f"HTTP client error: {str(e)}") raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
except asyncio.exceptions.TimeoutError as e:
await self.hooks['on_error'](e)
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
except Exception as e:
await self.hooks['on_error'](e)
raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
async def crawl( async def crawl(
self, self,