fix(http-crawler): bring back HTTP crawler strategy

2025-08-16 09:27:23 +08:00
parent e6044e6053
commit d30dc9fdc1
1 changed files with 262 additions and 0 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -2129,3 +2129,265 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return True  # Default to scrolling if check fails


+####################################################################################################
+# HTTP Crawler Strategy
+####################################################################################################
+
+class HTTPCrawlerError(Exception):
+    """Base error class for HTTP crawler specific exceptions"""
+    pass
+
+
+class ConnectionTimeoutError(HTTPCrawlerError):
+    """Raised when connection timeout occurs"""
+    pass
+
+
+class HTTPStatusError(HTTPCrawlerError):
+    """Raised for unexpected status codes"""
+    def __init__(self, status_code: int, message: str):
+        self.status_code = status_code
+        super().__init__(f"HTTP {status_code}: {message}")
+
+
+class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
+    """
+    Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
+    """
+    
+    __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config')
+
+    DEFAULT_TIMEOUT: Final[int] = 30
+    DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024  
+    DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4)
+    DEFAULT_DNS_CACHE_TTL: Final[int] = 300
+    VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'})
+
+    _BASE_HEADERS: Final = MappingProxyType({
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+    })
+    
+    def __init__(
+        self, 
+        browser_config: Optional[HTTPCrawlerConfig] = None,
+        logger: Optional[AsyncLogger] = None,
+        max_connections: int = DEFAULT_MAX_CONNECTIONS,
+        dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL,
+        chunk_size: int = DEFAULT_CHUNK_SIZE
+    ):
+        """Initialize the HTTP crawler with config"""
+        self.browser_config = browser_config or HTTPCrawlerConfig()
+        self.logger = logger
+        self.max_connections = max_connections
+        self.dns_cache_ttl = dns_cache_ttl
+        self.chunk_size = chunk_size
+        self._session: Optional[aiohttp.ClientSession] = None
+        
+        self.hooks = {
+            k: partial(self._execute_hook, k) 
+            for k in ('before_request', 'after_request', 'on_error')
+        }
+
+        # Set default hooks
+        self.set_hook('before_request', lambda *args, **kwargs: None)
+        self.set_hook('after_request', lambda *args, **kwargs: None)
+        self.set_hook('on_error', lambda *args, **kwargs: None)
+                      
+
+    async def __aenter__(self) -> AsyncHTTPCrawlerStrategy:
+        await self.start()
+        return self
+        
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        await self.close()
+
+    @contextlib.asynccontextmanager
+    async def _session_context(self):
+        try:
+            if not self._session:
+                await self.start()
+            yield self._session
+        finally:
+            pass
+
+    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
+        if hook_type in self.hooks:
+            self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
+        else:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+
+    async def _execute_hook(
+        self, 
+        hook_type: str, 
+        hook_func: Callable,
+        *args: Any, 
+        **kwargs: Any
+    ) -> Any:
+        if asyncio.iscoroutinefunction(hook_func):
+            return await hook_func(*args, **kwargs)
+        return hook_func(*args, **kwargs)
+
+    async def start(self) -> None:
+        if not self._session:
+            connector = aiohttp.TCPConnector(
+                limit=self.max_connections,
+                ttl_dns_cache=self.dns_cache_ttl,
+                use_dns_cache=True,
+                force_close=False
+            )
+            self._session = aiohttp.ClientSession(
+                headers=dict(self._BASE_HEADERS),
+                connector=connector,
+                timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT)
+            )
+
+    async def close(self) -> None:
+        if self._session and not self._session.closed:
+            try:
+                await asyncio.wait_for(self._session.close(), timeout=5.0)
+            except asyncio.TimeoutError:
+                if self.logger:
+                    self.logger.warning(
+                        message="Session cleanup timed out",
+                        tag="CLEANUP"
+                    )
+            finally:
+                self._session = None
+
+    async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]:
+        async with aiofiles.open(path, mode='rb') as f:
+            while chunk := await f.read(self.chunk_size):
+                yield memoryview(chunk)
+
+    async def _handle_file(self, path: str) -> AsyncCrawlResponse:
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Local file not found: {path}")
+            
+        chunks = []
+        async for chunk in self._stream_file(path):
+            chunks.append(chunk.tobytes().decode('utf-8', errors='replace'))
+            
+        return AsyncCrawlResponse(
+            html=''.join(chunks),
+            response_headers={},
+            status_code=200
+        )
+
+    async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
+        return AsyncCrawlResponse(
+            html=content,
+            response_headers={},
+            status_code=200
+        )
+
+
+    async def _handle_http(
+        self, 
+        url: str, 
+        config: CrawlerRunConfig
+    ) -> AsyncCrawlResponse:
+        async with self._session_context() as session:
+            timeout = ClientTimeout(
+                total=config.page_timeout or self.DEFAULT_TIMEOUT,
+                connect=10,
+                sock_read=30
+            )
+            
+            headers = dict(self._BASE_HEADERS)
+            if self.browser_config.headers:
+                headers.update(self.browser_config.headers)
+
+            request_kwargs = {
+                'timeout': timeout,
+                'allow_redirects': self.browser_config.follow_redirects,
+                'ssl': self.browser_config.verify_ssl,
+                'headers': headers
+            }
+
+            if self.browser_config.method == "POST":
+                if self.browser_config.data:
+                    request_kwargs['data'] = self.browser_config.data
+                if self.browser_config.json:
+                    request_kwargs['json'] = self.browser_config.json
+
+            await self.hooks['before_request'](url, request_kwargs)
+
+            try:
+                async with session.request(self.browser_config.method, url, **request_kwargs) as response:
+                    content = memoryview(await response.read())
+                    
+                    if not (200 <= response.status < 300):
+                        raise HTTPStatusError(
+                            response.status,
+                            f"Unexpected status code for {url}"
+                        )
+                    
+                    encoding = response.charset
+                    if not encoding:
+                        encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'                    
+                    
+                    result = AsyncCrawlResponse(
+                        html=content.tobytes().decode(encoding, errors='replace'),
+                        response_headers=dict(response.headers),
+                        status_code=response.status,
+                        redirected_url=str(response.url)
+                    )
+                    
+                    await self.hooks['after_request'](result)
+                    return result
+
+            except aiohttp.ServerTimeoutError as e:
+                await self.hooks['on_error'](e)
+                raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
+                
+            except aiohttp.ClientConnectorError as e:
+                await self.hooks['on_error'](e)
+                raise ConnectionError(f"Connection failed: {str(e)}")
+                
+            except aiohttp.ClientError as e:
+                await self.hooks['on_error'](e)
+                raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
+            
+            except asyncio.exceptions.TimeoutError as e:
+                await self.hooks['on_error'](e)
+                raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
+            
+            except Exception as e:
+                await self.hooks['on_error'](e)
+                raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
+
+    async def crawl(
+        self, 
+        url: str, 
+        config: Optional[CrawlerRunConfig] = None, 
+        **kwargs
+    ) -> AsyncCrawlResponse:
+        config = config or CrawlerRunConfig.from_kwargs(kwargs)
+        
+        parsed = urlparse(url)
+        scheme = parsed.scheme.rstrip('/')
+        
+        if scheme not in self.VALID_SCHEMES:
+            raise ValueError(f"Unsupported URL scheme: {scheme}")
+            
+        try:
+            if scheme == 'file':
+                return await self._handle_file(parsed.path)
+            elif scheme == 'raw':
+                return await self._handle_raw(parsed.path)
+            else:  # http or https
+                return await self._handle_http(url, config)
+                
+        except Exception as e:
+            if self.logger:
+                self.logger.error(
+                    message="Crawl failed: {error}",
+                    tag="CRAWL",
+                    params={"error": str(e), "url": url}
+                )
+            raise