####################################################################################################
# HTTP Crawler Strategy
####################################################################################################

class HTTPCrawlerError(Exception):
    """Base error class for HTTP crawler specific exceptions."""
    pass


class ConnectionTimeoutError(HTTPCrawlerError):
    """Raised when a connection or read timeout occurs."""
    pass


class HTTPStatusError(HTTPCrawlerError):
    """Raised for unexpected (non-2xx) status codes."""

    def __init__(self, status_code: int, message: str):
        # Expose the numeric code so callers can branch on it (e.g. retry on 503).
        self.status_code = status_code
        super().__init__(f"HTTP {status_code}: {message}")


class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
    """
    Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.

    Handles http://, https://, file:// and raw: URLs through one shared
    aiohttp.ClientSession with connection pooling and DNS caching.  Local
    files are streamed in chunks; response bodies are decoded once at the
    boundary using the server-declared charset (falling back to detection).
    """

    # __slots__ keeps per-instance memory low when many strategies are created.
    __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config')

    DEFAULT_TIMEOUT: Final[int] = 30
    DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024
    DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4)
    DEFAULT_DNS_CACHE_TTL: Final[int] = 300
    VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'})

    # Immutable default header set; copied (dict(...)) before any mutation.
    _BASE_HEADERS: Final = MappingProxyType({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    })

    def __init__(
        self,
        browser_config: Optional[HTTPCrawlerConfig] = None,
        logger: Optional[AsyncLogger] = None,
        max_connections: int = DEFAULT_MAX_CONNECTIONS,
        dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL,
        chunk_size: int = DEFAULT_CHUNK_SIZE
    ):
        """Initialize the HTTP crawler with config.

        Args:
            browser_config: Request configuration (method, headers, body,
                redirect/SSL policy). Defaults to a fresh HTTPCrawlerConfig.
            logger: Optional async logger for cleanup/crawl diagnostics.
            max_connections: Connection-pool limit for the shared session.
            dns_cache_ttl: DNS cache lifetime in seconds.
            chunk_size: Read size in bytes when streaming local files.
        """
        self.browser_config = browser_config or HTTPCrawlerConfig()
        self.logger = logger
        self.max_connections = max_connections
        self.dns_cache_ttl = dns_cache_ttl
        self.chunk_size = chunk_size
        self._session: Optional[aiohttp.ClientSession] = None

        # Each hook entry routes through _execute_hook so both sync and async
        # user callbacks are supported transparently.
        self.hooks = {
            k: partial(self._execute_hook, k)
            for k in ('before_request', 'after_request', 'on_error')
        }

        # Install no-op defaults so call sites never need a None check.
        self.set_hook('before_request', lambda *args, **kwargs: None)
        self.set_hook('after_request', lambda *args, **kwargs: None)
        self.set_hook('on_error', lambda *args, **kwargs: None)

    async def __aenter__(self) -> "AsyncHTTPCrawlerStrategy":
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self.close()

    @contextlib.asynccontextmanager
    async def _session_context(self):
        """Yield the shared session, lazily creating it on first use."""
        try:
            if not self._session:
                await self.start()
            yield self._session
        finally:
            # Session is intentionally kept open for reuse; close() owns teardown.
            pass

    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
        """Register a callback for 'before_request', 'after_request' or 'on_error'.

        Raises:
            ValueError: If hook_type is not a known hook name.
        """
        if hook_type in self.hooks:
            self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
        else:
            raise ValueError(f"Invalid hook type: {hook_type}")

    async def _execute_hook(
        self,
        hook_type: str,
        hook_func: Callable,
        *args: Any,
        **kwargs: Any
    ) -> Any:
        """Invoke a hook callback, awaiting it when it is a coroutine function."""
        if asyncio.iscoroutinefunction(hook_func):
            return await hook_func(*args, **kwargs)
        return hook_func(*args, **kwargs)

    async def start(self) -> None:
        """Create the shared aiohttp session if it does not exist yet."""
        if not self._session:
            connector = aiohttp.TCPConnector(
                limit=self.max_connections,
                ttl_dns_cache=self.dns_cache_ttl,
                use_dns_cache=True,
                force_close=False  # keep-alive: reuse connections across requests
            )
            self._session = aiohttp.ClientSession(
                headers=dict(self._BASE_HEADERS),
                connector=connector,
                timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT)
            )

    async def close(self) -> None:
        """Close the shared session, capping a hung close at 5 seconds."""
        if self._session and not self._session.closed:
            try:
                await asyncio.wait_for(self._session.close(), timeout=5.0)
            except asyncio.TimeoutError:
                if self.logger:
                    self.logger.warning(
                        message="Session cleanup timed out",
                        tag="CLEANUP"
                    )
            finally:
                self._session = None

    async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]:
        """Yield zero-copy views over chunk_size-byte chunks of a local file."""
        async with aiofiles.open(path, mode='rb') as f:
            while chunk := await f.read(self.chunk_size):
                yield memoryview(chunk)

    async def _handle_file(self, path: str) -> AsyncCrawlResponse:
        """Read a local file and wrap its contents as a 200 response.

        Raises:
            FileNotFoundError: If path does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Local file not found: {path}")

        chunks = []
        async for chunk in self._stream_file(path):
            # errors='replace' keeps invalid UTF-8 from aborting the crawl.
            chunks.append(chunk.tobytes().decode('utf-8', errors='replace'))

        return AsyncCrawlResponse(
            html=''.join(chunks),
            response_headers={},
            status_code=200
        )

    async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
        """Wrap already-available raw HTML as a synthetic 200 response."""
        return AsyncCrawlResponse(
            html=content,
            response_headers={},
            status_code=200
        )

    async def _handle_http(
        self,
        url: str,
        config: CrawlerRunConfig
    ) -> AsyncCrawlResponse:
        """Perform the actual HTTP(S) request.

        Raises:
            HTTPStatusError: For non-2xx responses (status code preserved).
            ConnectionTimeoutError: On connect/read timeouts.
            ConnectionError: When the TCP connection cannot be established.
            HTTPCrawlerError: For any other client-side failure.
        """
        async with self._session_context() as session:
            timeout = ClientTimeout(
                total=config.page_timeout or self.DEFAULT_TIMEOUT,
                connect=10,
                sock_read=30
            )

            headers = dict(self._BASE_HEADERS)
            if self.browser_config.headers:
                headers.update(self.browser_config.headers)

            request_kwargs = {
                'timeout': timeout,
                'allow_redirects': self.browser_config.follow_redirects,
                'ssl': self.browser_config.verify_ssl,
                'headers': headers
            }

            # A request body is only attached for POST; other methods send none.
            if self.browser_config.method == "POST":
                if self.browser_config.data:
                    request_kwargs['data'] = self.browser_config.data
                if self.browser_config.json:
                    request_kwargs['json'] = self.browser_config.json

            await self.hooks['before_request'](url, request_kwargs)

            try:
                async with session.request(self.browser_config.method, url, **request_kwargs) as response:
                    content = memoryview(await response.read())

                    if not (200 <= response.status < 300):
                        raise HTTPStatusError(
                            response.status,
                            f"Unexpected status code for {url}"
                        )

                    # Prefer the server-declared charset; fall back to
                    # detection, then utf-8.
                    encoding = response.charset
                    if not encoding:
                        encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'

                    result = AsyncCrawlResponse(
                        html=content.tobytes().decode(encoding, errors='replace'),
                        response_headers=dict(response.headers),
                        status_code=response.status,
                        redirected_url=str(response.url)
                    )

                    await self.hooks['after_request'](result)
                    return result

            except HTTPStatusError as e:
                # Bug fix: the blanket `except Exception` below used to catch
                # this and re-wrap it as a generic HTTPCrawlerError, losing the
                # status_code attribute. Fire the hook and re-raise unchanged.
                await self.hooks['on_error'](e)
                raise

            except aiohttp.ServerTimeoutError as e:
                await self.hooks['on_error'](e)
                raise ConnectionTimeoutError(f"Request timed out: {str(e)}") from e

            except aiohttp.ClientConnectorError as e:
                await self.hooks['on_error'](e)
                raise ConnectionError(f"Connection failed: {str(e)}") from e

            except aiohttp.ClientError as e:
                await self.hooks['on_error'](e)
                raise HTTPCrawlerError(f"HTTP client error: {str(e)}") from e

            except asyncio.TimeoutError as e:
                await self.hooks['on_error'](e)
                raise ConnectionTimeoutError(f"Request timed out: {str(e)}") from e

            except Exception as e:
                await self.hooks['on_error'](e)
                raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") from e

    async def crawl(
        self,
        url: str,
        config: Optional[CrawlerRunConfig] = None,
        **kwargs
    ) -> AsyncCrawlResponse:
        """Dispatch a crawl by URL scheme (http/https, file, raw).

        Raises:
            ValueError: For unsupported URL schemes.
        """
        config = config or CrawlerRunConfig.from_kwargs(kwargs)

        parsed = urlparse(url)
        scheme = parsed.scheme.rstrip('/')

        if scheme not in self.VALID_SCHEMES:
            raise ValueError(f"Unsupported URL scheme: {scheme}")

        try:
            if scheme == 'file':
                return await self._handle_file(parsed.path)
            elif scheme == 'raw':
                # NOTE(review): raw content is taken from the URL path component;
                # content placed after `raw://` lands in netloc instead — confirm
                # against callers that build raw: URLs.
                return await self._handle_raw(parsed.path)
            else:  # http or https
                return await self._handle_http(url, config)

        except Exception as e:
            if self.logger:
                self.logger.error(
                    message="Crawl failed: {error}",
                    tag="CRAWL",
                    params={"error": str(e), "url": url}
                )
            raise