From dee5fe9851c1a38225531b53f82bdae3aaf5f33b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 22 Jan 2025 16:11:01 +0800 Subject: [PATCH] feat(proxy): add proxy rotation support and documentation Implements dynamic proxy rotation functionality with authentication support and IP verification. Updates include: - Added proxy rotation demo in features example - Updated proxy configuration handling in BrowserManager - Added proxy rotation documentation - Updated README with new proxy rotation feature - Bumped version to 0.4.3b2 This change enables users to dynamically switch between proxies and verify IP addresses for each request. --- README.md | 1 + crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 7 +++- docs/examples/v0_4_3_features_demo.py | 60 +++++++++++++++++++++++++++ docs/md_v2/advanced/proxy-security.md | 2 +- 5 files changed, 68 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66a652ff..9cfe4512 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ async def test_news_crawl(): - **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction. - **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). - **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. +- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence. - **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects. - **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. - **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. 
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 5d2b86af..a0acc761 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.3b1"
+__version__ = "0.4.3b2"
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index ae1788f1..a2bb7b96 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -633,9 +633,12 @@ class BrowserManager:
             if crawlerRunConfig.proxy_config:
                 proxy_settings = {
                     "server": crawlerRunConfig.proxy_config.get("server"),
-                    "username": crawlerRunConfig.proxy_config.get("username"),
-                    "password": crawlerRunConfig.proxy_config.get("password"),
                 }
+                if crawlerRunConfig.proxy_config.get("username"):
+                    proxy_settings.update({
+                        "username": crawlerRunConfig.proxy_config.get("username"),
+                        "password": crawlerRunConfig.proxy_config.get("password"),
+                    })
                 context_settings["proxy"] = proxy_settings
 
         if self.config.text_mode:
diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py
index 2ffaa172..033bf30f 100644
--- a/docs/examples/v0_4_3_features_demo.py
+++ b/docs/examples/v0_4_3_features_demo.py
@@ -233,6 +233,64 @@ async def demo_llm_schema_generation():
         print("Successfully used generated schema for crawling")
 
+
+async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
+    """Get next proxy from local file"""
+    try:
+        with open(proxy_file) as f:
+            proxies = f.read().splitlines()
+        if not proxies:
+            return None
+
+        ip, port, username, password = random.choice(proxies).split(":")
+        return {
+            "server": f"http://{ip}:{port}",
+            "username": username,
+            "password": password,
+            "ip": ip  # Store original IP for verification
+        }
+    except Exception as e:
+        print(f"Error loading proxy: {e}")
+        return None
+
+async def demo_proxy_rotation():
+    """
+    8. Proxy Rotation Demo
+    ===================
+    Demonstrates how to rotate proxies for each request using Crawl4ai.
+    """
+    print("\n=== 8. Proxy Rotation Demo ===")
+
+
+    # Create 3 test requests to httpbin
+    urls = ["https://httpbin.org/ip"] * 3
+
+    browser_config = BrowserConfig(headless=True)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for url in urls:
+            proxy = await get_next_proxy()
+            if not proxy:
+                print("No proxy available, skipping...")
+                continue
+
+            # Create new config with proxy
+            current_config = run_config.clone(proxy_config=proxy)
+            result = await crawler.arun(url=url, config=current_config)
+
+            if result.success:
+                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                print(f"Proxy {proxy['ip']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
+                verified = bool(ip_match) and ip_match.group(0) == proxy['ip']
+                if verified:
+                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
+                else:
+                    print(f"❌ Proxy failed or IP mismatch!")
+            else:
+                print(f"Failed with proxy {proxy['ip']}")
+
+
 async def main():
     """Run all feature demonstrations."""
     demo_memory_dispatcher(),
@@ -247,6 +305,8 @@ async def main():
     print("\n" + "=" * 50 + "\n")
     demo_robots_compliance(),
     print("\n" + "=" * 50 + "\n")
+    await demo_proxy_rotation()
+    print("\n" + "=" * 50 + "\n")
 
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md
index 9b64fd84..0e56572c 100644
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -38,7 +38,7 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
 
 Here's the corrected documentation:
 
-## Rotating Proxies [COMING SOON]
+## Rotating Proxies
 
 Example using a proxy rotation service dynamically: