fix: prevent the session from being closed after each request, so the connection pool is preserved. Fixes: https://github.com/unclecode/crawl4ai/issues/867

This commit is contained in:
Aravind Karnam
2025-03-25 13:46:55 +05:30
parent 2f0e217751
commit e3111d0a32

View File

@@ -1702,15 +1702,6 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Async context-manager exit: release the crawler's resources.

    Delegates unconditionally to ``self.close()``; any in-flight
    exception is propagated (returning ``None`` does not suppress it).
    """
    await self.close()
@contextlib.asynccontextmanager
async def _session_context(self):
    """Yield the strategy's HTTP session, creating it on demand.

    NOTE(review): the ``finally`` clause awaits ``self.close()`` on
    every exit, so the session — and with it the connection pool — is
    torn down after each request. This is the per-request close that
    this commit removes (see issue #867 in the commit message).
    """
    try:
        # Lazily start the session the first time it is needed.
        if not self._session:
            await self.start()
        yield self._session
    finally:
        # Closes the session even on success, defeating connection reuse.
        await self.close()
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
    """Register ``hook_func`` under a known hook slot.

    The callable is wrapped with :func:`functools.partial` so that
    ``self._execute_hook`` receives the hook type and function when the
    hook later fires.

    Args:
        hook_type: Name of the hook slot (must already exist in
            ``self.hooks``).
        hook_func: Callable to invoke for that hook.

    Note:
        An unknown ``hook_type`` is silently ignored — preserved from
        the original behavior; callers may rely on it being a no-op.
    """
    if hook_type in self.hooks:
        self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
@@ -1787,75 +1778,77 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
# NOTE(review): the method's `def` line lies above this diff hunk; the
# name `_make_request` is inferred from the enclosing
# AsyncHTTPCrawlerStrategy context — confirm against the full file.
async def _make_request(
    self,
    url: str,
    config: CrawlerRunConfig
) -> AsyncCrawlResponse:
    """Perform a single HTTP request and return an AsyncCrawlResponse.

    Reuses the long-lived pooled session; it is (re)started only when
    missing or already closed — this is the fix for issue #867, which
    removed the per-request session teardown.

    Args:
        url: Target URL to fetch.
        config: Run configuration; ``page_timeout`` overrides the
            default total timeout.

    Raises:
        HTTPStatusError: on a non-2xx status.
        ConnectionTimeoutError: on server or asyncio timeouts.
        ConnectionError: when the connection itself fails.
        HTTPCrawlerError: on other aiohttp client errors or any
            unexpected failure.
    """
    # Recreate the session only when needed, keeping the pool alive.
    if not self._session or self._session.closed:
        await self.start()

    timeout = ClientTimeout(
        total=config.page_timeout or self.DEFAULT_TIMEOUT,
        connect=10,
        sock_read=30
    )

    headers = dict(self._BASE_HEADERS)
    if self.browser_config.headers:
        headers.update(self.browser_config.headers)

    request_kwargs = {
        'timeout': timeout,
        'allow_redirects': self.browser_config.follow_redirects,
        'ssl': self.browser_config.verify_ssl,
        'headers': headers
    }

    if self.browser_config.method == "POST":
        if self.browser_config.data:
            request_kwargs['data'] = self.browser_config.data
        if self.browser_config.json:
            request_kwargs['json'] = self.browser_config.json

    # Hooks may mutate request_kwargs before the request is sent.
    await self.hooks['before_request'](url, request_kwargs)

    try:
        async with self._session.request(
            self.browser_config.method, url, **request_kwargs
        ) as response:
            # memoryview avoids copying the body for the status check.
            content = memoryview(await response.read())

            if not (200 <= response.status < 300):
                raise HTTPStatusError(
                    response.status,
                    f"Unexpected status code for {url}"
                )

            # Fall back to charset detection when the server omits one.
            encoding = response.charset
            if not encoding:
                encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8'

            result = AsyncCrawlResponse(
                html=content.tobytes().decode(encoding, errors='replace'),
                response_headers=dict(response.headers),
                status_code=response.status,
                redirected_url=str(response.url)
            )
            await self.hooks['after_request'](result)
            return result

    except aiohttp.ServerTimeoutError as e:
        await self.hooks['on_error'](e)
        raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
    except aiohttp.ClientConnectorError as e:
        await self.hooks['on_error'](e)
        raise ConnectionError(f"Connection failed: {str(e)}")
    except aiohttp.ClientError as e:
        await self.hooks['on_error'](e)
        raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
    except asyncio.exceptions.TimeoutError as e:
        await self.hooks['on_error'](e)
        raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
    except Exception as e:
        await self.hooks['on_error'](e)
        raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
async def crawl( async def crawl(
self, self,