feat(crawler): add HTTP crawler strategy for lightweight web scraping

Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include: - Support for HTTP/HTTPS requests with configurable methods, headers, and timeouts - File and raw content handling capabilities - Streaming response processing for large files - Customizable request/response hooks - Comprehensive error handling Also refactors browser management code into separate module for better organization.
2025-02-15 19:26:30 +08:00
parent 063df572b0
commit 8bb799068e
7 changed files with 1353 additions and 851 deletions
--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -2,7 +2,7 @@
 import warnings

 from .async_webcrawler import AsyncWebCrawler, CacheMode
-from .async_configs import BrowserConfig, CrawlerRunConfig
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
 from .content_scraping_strategy import (
    ContentScrapingStrategy,
    WebScrapingStrategy,
@@ -70,6 +70,7 @@ __all__ = [
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
+    "HTTPCrawlerConfig",
    "ExtractionStrategy",
    "LLMExtractionStrategy",
    "CosineStrategy",