Files
crawl4ai/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
UncleCode 8bb799068e feat(crawler): add HTTP crawler strategy for lightweight web scraping
Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include:
- Support for HTTP/HTTPS requests with configurable methods, headers, and timeouts
- File and raw content handling capabilities
- Streaming response processing for large files
- Customizable request/response hooks
- Comprehensive error handling

Also refactors browser management code into a separate module for better organization.
2025-02-15 19:26:30 +08:00

56 lines
1.8 KiB
Python

import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
HTTPCrawlerConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter
)
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
async def main():
    """Crawl a few sample URLs with the HTTP crawler strategy and print a summary.

    Builds an ``AsyncHTTPCrawlerStrategy`` configured for GET requests with SSL
    verification and redirect following, runs it through ``AsyncWebCrawler``
    with cache bypassed and a pruning content filter on the markdown
    generator, then prints the status code, the raw HTML length and, when a
    markdown result is present, the raw markdown length for each URL.

    An exception raised while handling one URL is printed and does not stop
    the remaining URLs from being tried.
    """
    # Initialize HTTP crawler strategy
    http_strategy = AsyncHTTPCrawlerStrategy(
        browser_config=HTTPCrawlerConfig(
            method="GET",
            verify_ssl=True,
            follow_redirects=True,
        ),
        logger=AsyncLogger(verbose=True),
    )

    # Initialize web crawler with HTTP strategy
    async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48,
                    threshold_type="fixed",
                    min_word_threshold=0,
                )
            ),
        )

        # Test different URLs
        urls = [
            "https://example.com",
            "https://httpbin.org/get",
            "raw://<html><body>Test content</body></html>",
        ]

        for url in urls:
            print(f"\n=== Testing {url} ===")
            try:
                result = await crawler.arun(url=url, config=crawler_config)
                print(f"Status: {result.status_code}")
                print(f"Raw HTML length: {len(result.html)}")
                # hasattr() alone is not enough: the attribute can exist and
                # still be None, in which case dereferencing .raw_markdown
                # would raise and be misreported below as a crawl error.
                markdown = getattr(result, "markdown_v2", None)
                if markdown is not None:
                    print(f"Markdown length: {len(markdown.raw_markdown)}")
            except Exception as e:
                # Deliberate best-effort boundary: report and move on to the
                # next URL so one failure does not abort the whole run.
                print(f"Error: {e}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())