From dee5fe9851c1a38225531b53f82bdae3aaf5f33b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 16:11:01 +0800 Subject: [PATCH] feat(proxy): add proxy rotation support and documentation Implements dynamic proxy rotation functionality with authentication support and IP verification. Updates include: - Added proxy rotation demo in features example - Updated proxy configuration handling in BrowserManager - Added proxy rotation documentation - Updated README with new proxy rotation feature - Bumped version to 0.4.3b2 This change enables users to dynamically switch between proxies and verify IP addresses for each request. --- README.md | 1 + crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 7 +++- docs/examples/v0_4_3_features_demo.py | 60 +++++++++++++++++++++++++++ docs/md_v2/advanced/proxy-security.md | 2 +- 5 files changed, 68 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66a652ff..9cfe4512 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ async def test_news_crawl(): - **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction. - **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). - **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. +- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence. - **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects. - **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. - **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. 
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 5d2b86af..a0acc761 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.3b1"
+__version__ = "0.4.3b2"
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index ae1788f1..a2bb7b96 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -633,9 +633,12 @@ class BrowserManager:
             if crawlerRunConfig.proxy_config:
                 proxy_settings = {
                     "server": crawlerRunConfig.proxy_config.get("server"),
-                    "username": crawlerRunConfig.proxy_config.get("username"),
-                    "password": crawlerRunConfig.proxy_config.get("password"),
                 }
+                if crawlerRunConfig.proxy_config.get("username"):
+                    proxy_settings.update({
+                        "username": crawlerRunConfig.proxy_config.get("username"),
+                        "password": crawlerRunConfig.proxy_config.get("password"),
+                    })
                 context_settings["proxy"] = proxy_settings
 
         if self.config.text_mode:
diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py
index 2ffaa172..033bf30f 100644
--- a/docs/examples/v0_4_3_features_demo.py
+++ b/docs/examples/v0_4_3_features_demo.py
@@ -233,6 +233,64 @@ async def demo_llm_schema_generation():
         print("Successfully used generated schema for crawling")
 
+
+async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
+    """Get next proxy from local file"""
+    try:
+        with open(proxy_file) as f:
+            proxies = f.read().splitlines()
+        if not proxies:
+            return None
+
+        ip, port, username, password = random.choice(proxies).split(":")
+        return {
+            "server": f"http://{ip}:{port}",
+            "username": username,
+            "password": password,
+            "ip": ip  # Store original IP for verification
+        }
+    except Exception as e:
+        print(f"Error loading proxy: {e}")
+        return None
+
+async def demo_proxy_rotation():
+    """
+    8. Proxy Rotation Demo
+    ===================
+    Demonstrates how to rotate proxies for each request using Crawl4ai.
+    """
+    print("\n=== 8. Proxy Rotation Demo ===")
+
+
+    # Create 3 test requests to httpbin
+    urls = ["https://httpbin.org/ip"] * 3
+
+    browser_config = BrowserConfig(headless=True)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for url in urls:
+            proxy = await get_next_proxy()
+            if not proxy:
+                print("No proxy available, skipping...")
+                continue
+
+            # Create new config with proxy
+            current_config = run_config.clone(proxy_config=proxy)
+            result = await crawler.arun(url=url, config=current_config)
+
+            if result.success:
+                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                print(f"Proxy {proxy['ip']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
+                verified = bool(ip_match) and ip_match.group(0) == proxy['ip']
+                if verified:
+                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
+                else:
+                    print(f"❌ Proxy failed or IP mismatch!")
+            else:
+                print(f"Failed with proxy {proxy['ip']}")
+
+
 async def main():
     """Run all feature demonstrations."""
     demo_memory_dispatcher(),
@@ -247,6 +305,8 @@ async def main():
     print("\n" + "=" * 50 + "\n")
     demo_robots_compliance(),
     print("\n" + "=" * 50 + "\n")
+    await demo_proxy_rotation()
+    print("\n" + "=" * 50 + "\n")
 
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md
index 9b64fd84..0e56572c 100644
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -38,7 +38,7 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
 
 Here's the corrected documentation:
 
-## Rotating Proxies [COMING SOON]
+## Rotating Proxies
 
 Example using a proxy rotation service dynamically: