feat(crawler): add HTTP crawler strategy for lightweight web scraping

Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include: support for HTTP/HTTPS requests with configurable methods, headers, and timeouts; file and raw content handling capabilities; streaming response processing for large files; customizable request/response hooks; and comprehensive error handling. Also refactors browser management code into a separate module for better organization.
2025-02-15 19:26:30 +08:00
parent 063df572b0
commit 8bb799068e
7 changed files with 1353 additions and 851 deletions
--- a/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
+++ b/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
@@ -0,0 +1,56 @@
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    HTTPCrawlerConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter
+)
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_logger import AsyncLogger
+
+async def main():
+    """Crawl sample URLs with AsyncHTTPCrawlerStrategy and print basic stats.
+
+    Per-URL failures are caught and printed so the remaining URLs still run.
+    """
+    # Initialize HTTP crawler strategy
+    http_strategy = AsyncHTTPCrawlerStrategy(
+        browser_config=HTTPCrawlerConfig(method="GET", verify_ssl=True, follow_redirects=True),
+        logger=AsyncLogger(verbose=True)
+    )
+
+    # Initialize web crawler with HTTP strategy
+    async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(
+                    threshold=0.48,
+                    threshold_type="fixed",
+                    min_word_threshold=0
+                )
+            )
+        )
+
+        # Test different URLs: two live HTTP(S) endpoints and one inline raw:// document
+        urls = [
+            "https://example.com",
+            "https://httpbin.org/get",
+            "raw://<html><body>Test content</body></html>"
+        ]
+
+        for url in urls:
+            print(f"\n=== Testing {url} ===")
+            try:
+                result = await crawler.arun(url=url, config=crawler_config)
+                print(f"Status: {result.status_code}")
+                print(f"Raw HTML length: {len(result.html)}")
+                if getattr(result, 'markdown_v2', None) is not None:  # may exist as None
+                    print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+            except Exception as e:
+                print(f"Error: {e}")
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    asyncio.run(main())