fix: Revert the session-management changes in AsyncHTTPCrawlerStrategy and fix the underlying issue by no longer closing the session in the `finally` block of `_session_context`.

This commit is contained in:
Aravind Karnam
2025-04-08 18:31:00 +05:30
parent 7155778eac
commit 6f7ab9c927

View File

@@ -1706,6 +1706,15 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave the ``async with`` block by awaiting ``self.close()``.

    The exception details are accepted but not inspected, and nothing is
    returned, so an exception raised inside the block is not suppressed.
    """
    await self.close()
@contextlib.asynccontextmanager
async def _session_context(self):
    """Yield ``self._session``, awaiting ``self.start()`` first if it is unset.

    Nothing is torn down when the context exits: the session is left as
    it is, and any exception raised by the caller propagates unchanged.
    """
    if not self._session:
        # No usable session yet; start() presumably creates one (confirm
        # against its definition, which is not part of this hunk).
        await self.start()
    yield self._session
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
if hook_type in self.hooks:
self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
@@ -1782,9 +1791,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
url: str,
config: CrawlerRunConfig
) -> AsyncCrawlResponse:
if not self._session or self._session.closed:
await self.start()
async with self._session_context() as session:
timeout = ClientTimeout(
total=config.page_timeout or self.DEFAULT_TIMEOUT,
connect=10,
@@ -1811,7 +1818,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
await self.hooks['before_request'](url, request_kwargs)
try:
async with self._session.request(self.browser_config.method, url, **request_kwargs) as response:
async with session.request(self.browser_config.method, url, **request_kwargs) as response:
content = memoryview(await response.read())
if not (200 <= response.status < 300):