fix: Revert changes to session management in AsyncHttpWebcrawler and solve the underlying issue by removing the session closure in the `finally` block of the session context manager.

This commit is contained in:
Aravind Karnam
2025-04-08 18:31:00 +05:30
parent 7155778eac
commit 6f7ab9c927

View File

@@ -1706,6 +1706,15 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
await self.close() await self.close()
@contextlib.asynccontextmanager
async def _session_context(self):
    """Yield the shared aiohttp client session, creating it on demand.

    Lazily calls ``start()`` when no session exists yet — or when a
    previously created session has been closed — then yields it.  The
    session is deliberately NOT closed on exit: it is reused across
    requests, and closing it here was the underlying bug this context
    manager was changed to avoid (final teardown happens in
    ``close()`` / ``__aexit__``).

    Yields:
        The live session stored in ``self._session``.
    """
    # Also restart when an existing session was closed; checking only
    # `not self._session` would yield a closed, unusable session.
    if not self._session or self._session.closed:
        await self.start()
    yield self._session
def set_hook(self, hook_type: str, hook_func: Callable) -> None: def set_hook(self, hook_type: str, hook_func: Callable) -> None:
if hook_type in self.hooks: if hook_type in self.hooks:
self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
@@ -1782,77 +1791,75 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
url: str, url: str,
config: CrawlerRunConfig config: CrawlerRunConfig
) -> AsyncCrawlResponse: ) -> AsyncCrawlResponse:
if not self._session or self._session.closed: async with self._session_context() as session:
await self.start() timeout = ClientTimeout(
total=config.page_timeout or self.DEFAULT_TIMEOUT,
connect=10,
sock_read=30
)
timeout = ClientTimeout( headers = dict(self._BASE_HEADERS)
total=config.page_timeout or self.DEFAULT_TIMEOUT, if self.browser_config.headers:
connect=10, headers.update(self.browser_config.headers)
sock_read=30
)
headers = dict(self._BASE_HEADERS) request_kwargs = {
if self.browser_config.headers: 'timeout': timeout,
headers.update(self.browser_config.headers) 'allow_redirects': self.browser_config.follow_redirects,
'ssl': self.browser_config.verify_ssl,
'headers': headers
}
request_kwargs = { if self.browser_config.method == "POST":
'timeout': timeout, if self.browser_config.data:
'allow_redirects': self.browser_config.follow_redirects, request_kwargs['data'] = self.browser_config.data
'ssl': self.browser_config.verify_ssl, if self.browser_config.json:
'headers': headers request_kwargs['json'] = self.browser_config.json
}
if self.browser_config.method == "POST": await self.hooks['before_request'](url, request_kwargs)
if self.browser_config.data:
request_kwargs['data'] = self.browser_config.data
if self.browser_config.json:
request_kwargs['json'] = self.browser_config.json
await self.hooks['before_request'](url, request_kwargs) try:
async with session.request(self.browser_config.method, url, **request_kwargs) as response:
content = memoryview(await response.read())
try: if not (200 <= response.status < 300):
async with self._session.request(self.browser_config.method, url, **request_kwargs) as response: raise HTTPStatusError(
content = memoryview(await response.read()) response.status,
f"Unexpected status code for {url}"
)
if not (200 <= response.status < 300): encoding = response.charset
raise HTTPStatusError( if not encoding:
response.status, encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
f"Unexpected status code for {url}"
result = AsyncCrawlResponse(
html=content.tobytes().decode(encoding, errors='replace'),
response_headers=dict(response.headers),
status_code=response.status,
redirected_url=str(response.url)
) )
encoding = response.charset await self.hooks['after_request'](result)
if not encoding: return result
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
result = AsyncCrawlResponse( except aiohttp.ServerTimeoutError as e:
html=content.tobytes().decode(encoding, errors='replace'), await self.hooks['on_error'](e)
response_headers=dict(response.headers), raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
status_code=response.status,
redirected_url=str(response.url)
)
await self.hooks['after_request'](result) except aiohttp.ClientConnectorError as e:
return result await self.hooks['on_error'](e)
raise ConnectionError(f"Connection failed: {str(e)}")
except aiohttp.ServerTimeoutError as e: except aiohttp.ClientError as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise ConnectionTimeoutError(f"Request timed out: {str(e)}") raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
except aiohttp.ClientConnectorError as e: except asyncio.exceptions.TimeoutError as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise ConnectionError(f"Connection failed: {str(e)}") raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
except aiohttp.ClientError as e: except Exception as e:
await self.hooks['on_error'](e) await self.hooks['on_error'](e)
raise HTTPCrawlerError(f"HTTP client error: {str(e)}") raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
except asyncio.exceptions.TimeoutError as e:
await self.hooks['on_error'](e)
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
except Exception as e:
await self.hooks['on_error'](e)
raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
async def crawl( async def crawl(
self, self,