feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions
--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
 )
 from .proxy_strategy import (
    ProxyRotationStrategy,
    RoundRobinProxyStrategy,
 )
 from .extraction_strategy import (
    ExtractionStrategy,
    LLMExtractionStrategy,
@@ -60,31 +64,33 @@ __all__ = [
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
    "ProxyRotationStrategy",
    "RoundRobinProxyStrategy",
 ]
-def is_sync_version_installed():
+# def is_sync_version_installed():
-    try:
+#     try:
-        import selenium # noqa
+#         import selenium # noqa
-        return True
+#         return True
-    except ImportError:
+#     except ImportError:
-        return False
+#         return False
-if is_sync_version_installed():
+# if is_sync_version_installed():
-    try:
+#     try:
-        from .web_crawler import WebCrawler
+#         from .web_crawler import WebCrawler
-        __all__.append("WebCrawler")
+#         __all__.append("WebCrawler")
-    except ImportError:
+#     except ImportError:
-        print(
+#         print(
-            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
-        )
+#         )
-else:
+# else:
-    WebCrawler = None
+#     WebCrawler = None
-    # import warnings
+#     # import warnings
-    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
 # Disable all Pydantic warnings
 warnings.filterwarnings("ignore", module="pydantic")
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
 import inspect
 from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
        proxy_config: dict = None,
        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.proxy_config = proxy_config
        self.proxy_rotation_strategy = proxy_rotation_strategy
        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
            proxy_config=kwargs.get("proxy_config"),
            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
            "parser_type": self.parser_type,
            "scraping_strategy": self.scraping_strategy,
            "proxy_config": self.proxy_config,
            "proxy_rotation_strategy": self.proxy_rotation_strategy,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -394,6 +394,19 @@ class AsyncWebCrawler:
                        tag="FETCH",
                    )
                # Update proxy configuration from rotation strategy if available
                if config and config.proxy_rotation_strategy:
                    next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
                    if next_proxy:
                        if verbose:
                            self.logger.info(
                                message="Switch proxy: {proxy}",
                                tag="PROXY",
                                params={"proxy": next_proxy.get("server")},
                            )
                        config.proxy_config = next_proxy
                        # config = config.clone(proxy_config=next_proxy)
                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()
--- a/crawl4ai/proxy_strategy.py
+++ b/crawl4ai/proxy_strategy.py
@@ -0,0 +1,43 @@
 from typing import List, Dict, Optional
 from abc import ABC, abstractmethod
 from itertools import cycle
 class ProxyRotationStrategy(ABC):
    """Interface that every proxy rotation strategy has to implement.

    A concrete strategy must provide both a way to register proxy
    configurations and a coroutine that hands out the next one to use.
    """

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        """Return the next proxy configuration picked by the strategy (may be None)."""

    @abstractmethod
    def add_proxies(self, proxies: List[Dict]):
        """Register additional proxy configurations with the strategy."""
 class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Simple round-robin proxy rotation strategy.

    Proxies are handed out in the order they were added, wrapping back to
    the first one after the last. The rotation position is tracked with an
    index instead of an ``itertools.cycle`` iterator so that:

    * an empty pool always yields ``None`` (a ``cycle`` over an empty list
      is truthy and raises ``StopIteration`` on ``next()``, which surfaces
      from a coroutine as ``RuntimeError``);
    * adding proxies later does not restart the rotation from the first
      proxy.
    """

    def __init__(self, proxies: List[Dict] = None):
        """
        Initialize with optional list of proxy configurations

        Args:
            proxies: List of proxy config dictionaries, each containing at least
                    'server' key with proxy URL
        """
        self._proxies = []
        # Position of the proxy that the next get_next_proxy() call returns.
        self._next_index = 0
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        """Add new proxies to the end of the rotation pool.

        The current rotation position is kept, so proxies already served in
        this pass are not repeated just because the pool grew.

        Args:
            proxies: List of proxy config dictionaries to append.
        """
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        """Get next proxy in round-robin fashion.

        Returns:
            The next proxy config dictionary, or None when the pool is empty.
        """
        if not self._proxies:
            return None
        # Wrap lazily at read time: if proxies were appended after the last
        # one was served, the index already points at the first new entry.
        if self._next_index >= len(self._proxies):
            self._next_index = 0
        proxy = self._proxies[self._next_index]
        self._next_index += 1
        return proxy
--- a/docs/examples/proxy_rotation_demo.py
+++ b/docs/examples/proxy_rotation_demo.py
@@ -0,0 +1,161 @@
 import os
 import re
 from typing import List, Dict
 from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    RoundRobinProxyStrategy
 )
 def load_proxies_from_env() -> List[Dict]:
    """Load proxies from PROXIES environment variable.

    PROXIES is read as a comma-separated list of ``ip:port:username:password``
    entries. Surrounding whitespace and empty entries are ignored. Each entry
    is parsed on its own, so one malformed entry is reported and skipped
    without discarding the entries that follow it.

    Returns:
        A list of proxy config dictionaries with the keys ``server``,
        ``username``, ``password`` and ``ip``; empty when PROXIES is unset
        or holds no valid entry.
    """
    proxies = []
    for entry in os.getenv("PROXIES", "").split(","):
        entry = entry.strip()
        if not entry:
            continue
        try:
            # maxsplit=3 keeps any ':' characters inside the password intact.
            ip, port, username, password = entry.split(":", 3)
        except ValueError as e:
            # Only the unpacking error is printed; the raw entry may contain
            # credentials and is deliberately not echoed.
            print(f"Error loading proxies from environment: {e}")
            continue
        proxies.append({
            "server": f"http://{ip}:{port}",
            "username": username,
            "password": password,
            "ip": ip,  # Store original IP for verification
        })
    return proxies
 async def demo_proxy_rotation():
    """
    Round-robin proxy rotation, one request at a time
    =================================================
    Loads proxies from the environment, wires them into a
    RoundRobinProxyStrategy, and issues one sequential request per proxy,
    printing whether the IP echoed in the response matches the proxy that
    the run config currently points at.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    # Strategy and configs
    rotation = RoundRobinProxyStrategy(proxies)
    browser_cfg = BrowserConfig(headless=True, verbose=False)
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=rotation
    )

    # One request per configured proxy, all against the same IP-echo endpoint
    targets = ["https://httpbin.org/ip"] * len(proxies)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        for target in targets:
            result = await crawler.arun(url=target, config=run_cfg)
            if not result.success:
                print(f"Request failed: {result.error_message}")
                continue

            # Pull the first dotted-quad out of the response body
            ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
            current_proxy = run_cfg.proxy_config or None
            if not current_proxy:
                continue

            print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
            if ip_match and ip_match.group(0) == current_proxy['ip']:
                print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
            else:
                print("❌ Proxy failed or IP mismatch!")
 async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =======================================
    Demonstrates proxy rotation using arun_many with memory dispatcher.

    Each response IP is verified against the full set of configured proxy
    IPs. ``run_config.proxy_config`` is a single slot on a config shared by
    the whole batch, so it cannot identify which proxy served each result.

    Any exception raised during the demo is caught and printed.
    """
    # Imported here so the demo also works when this module is imported
    # rather than executed as a script.
    from crawl4ai import DefaultMarkdownGenerator, MemoryAdaptiveDispatcher

    print("\n=== Proxy Rotation Batch Demo ===")
    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return
        proxy_strategy = RoundRobinProxyStrategy(proxies)
        # Lookup used to map a response IP back to the proxy it belongs to
        proxies_by_ip = {proxy["ip"]: proxy for proxy in proxies}

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator()
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # one session at a time for this demo
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Verify results
            success_count = 0
            for result in results:
                if not result.success:
                    print(f"Request failed: {result.error_message}")
                    continue
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                if ip_match:
                    response_ip = ip_match.group(0)
                    matched_proxy = proxies_by_ip.get(response_ip)
                    print(f"URL {result.url}")
                    if matched_proxy:
                        print(f"Proxy {matched_proxy['server']} -> Response IP: {response_ip}")
                        print(f"✅ Proxy working! IP matches: {matched_proxy['ip']}")
                        success_count += 1
                    else:
                        print(f"Response IP: {response_ip}")
                        print("❌ Proxy failed or IP mismatch!")
                print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        # Demo boundary: report the failure instead of crashing the script
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
 # Script entry point: runs the selected demo(s) when this file is executed directly.
 if __name__ == "__main__":
    import asyncio
    # These imports run at module level inside the guard, so the names are
    # bound as module globals before run_demos() executes.
    from crawl4ai import (
        CrawlerMonitor, 
        DisplayMode,
        MemoryAdaptiveDispatcher,
        DefaultMarkdownGenerator
    )
    # Wrapper coroutine so the demo selection lives in one place.
    async def run_demos():
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo
    asyncio.run(run_demos())
--- a/docs/md_v2/api/async-webcrawler.md
+++ b/docs/md_v2/api/async-webcrawler.md
@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
 ### 4.2 Example Usage
 Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
 ```python
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
 from crawl4ai.dispatcher import DisplayMode
 # Configure browser
 browser_cfg = BrowserConfig(headless=True)
 # Configure crawler with rate limiting
 run_cfg = CrawlerRunConfig(
    # Enable rate limiting
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 2.0),  # Random delay between 1-2 seconds
        max_delay=30.0,         # Maximum delay after rate limit hits
        max_retries=2,          # Number of retries before giving up
        rate_limit_codes=[429, 503]  # Status codes that trigger rate limiting
    ),
    # Resource monitoring
    memory_threshold_percent=70.0,  # Pause if memory exceeds this
    check_interval=0.5,            # How often to check resources
    max_session_permit=3,          # Maximum concurrent crawls
    display_mode=DisplayMode.DETAILED.value  # Show detailed progress
 )
 urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
 ]
 async with AsyncWebCrawler(config=browser_cfg) as crawler:
    results = await crawler.arun_many(urls, config=run_cfg)
    for result in results:
        print(f"URL: {result.url}, Success: {result.success}")
 ```
 ### 4.3 Key Features
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -159,32 +159,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
 ---
-### G) **Rate Limiting & Resource Management**
+### G) **Debug & Logging**
 | **Parameter**                | **Type / Default**                     | **What It Does**                                                                                                           |
 |------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
 | **`enable_rate_limiting`**  | `bool` (default: `False`)              | Enable intelligent rate limiting for multiple URLs                                                                          |
 | **`rate_limit_config`**     | `RateLimitConfig` (default: `None`)    | Configuration for rate limiting behavior                                                                                   |
 The `RateLimitConfig` class has these fields:
 | **Field**           | **Type / Default**                     | **What It Does**                                                                                                           |
 |--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
 | **`base_delay`**   | `Tuple[float, float]` (1.0, 3.0)      | Random delay range between requests to the same domain                                                                      |
 | **`max_delay`**    | `float` (60.0)                        | Maximum delay after rate limit detection                                                                                    |
 | **`max_retries`**  | `int` (3)                             | Number of retries before giving up on rate-limited requests                                                                 |
 | **`rate_limit_codes`** | `List[int]` ([429, 503])          | HTTP status codes that trigger rate limiting behavior                                                                       |
 | **Parameter**                  | **Type / Default**                     | **What It Does**                                                                                                           |
 |-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
 | **`memory_threshold_percent`** | `float` (70.0)                        | Maximum memory usage before pausing new crawls                                                                              |
 | **`check_interval`**          | `float` (1.0)                         | How often to check system resources (in seconds)                                                                           |
 | **`max_session_permit`**      | `int` (20)                            | Maximum number of concurrent crawl sessions                                                                                |
 | **`display_mode`**            | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information                                                                                     |
 ---
 ### H) **Debug & Logging**
 | **Parameter**  | **Type / Default** | **What It Does**                                                         |
 |----------------|--------------------|---------------------------------------------------------------------------|
@@ -218,7 +193,7 @@ The `clone()` method is particularly useful when you need slightly different con
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 async def main():
    # Configure the browser
@@ -239,17 +214,6 @@ async def main():
        exclude_external_links=True,
        wait_for="css:.article-loaded",
        screenshot=True,
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 3.0),
            max_delay=60.0,
            max_retries=3,
            rate_limit_codes=[429, 503]
        ),
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=20,
        display_mode="DETAILED",
        stream=True
    )
--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -186,23 +186,19 @@ class CrawlerRunConfig:
   - If `True`, enables rate limiting for batch processing.  
   - Requires `rate_limit_config` to be set.
-10. **`rate_limit_config`**:  
+10. **`memory_threshold_percent`**:  
    - A `RateLimitConfig` object controlling rate limiting behavior.  
    - See below for details.
 11. **`memory_threshold_percent`**:  
    - The memory threshold (as a percentage) to monitor.  
    - If exceeded, the crawler will pause or slow down.
-12. **`check_interval`**:  
+11. **`check_interval`**:  
    - The interval (in seconds) to check system resources.  
    - Affects how often memory and CPU usage are monitored.
-13. **`max_session_permit`**:  
+12. **`max_session_permit`**:  
    - The maximum number of concurrent crawl sessions.  
    - Helps prevent overwhelming the system.
-14. **`display_mode`**:  
+13. **`display_mode`**:  
    - The display mode for progress information (`DETAILED` or `AGGREGATED`).  
    - Affects how much information is printed during the crawl.
@@ -236,58 +232,6 @@ The `clone()` method:
 - Leaves the original configuration unchanged
 - Perfect for creating variations without repeating all parameters
 ### Rate Limiting & Resource Management
 For batch processing with `arun_many()`, you can enable intelligent rate limiting:
 ```python
 from crawl4ai import RateLimitConfig
 config = CrawlerRunConfig(
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),    # Random delay range
        max_delay=60.0,           # Max delay after rate limits
        max_retries=3,            # Retries before giving up
        rate_limit_codes=[429, 503]  # Status codes to watch
    ),
    memory_threshold_percent=70.0,  # Memory threshold
    check_interval=1.0,            # Resource check interval
    max_session_permit=20,         # Max concurrent crawls
    display_mode="DETAILED"        # Progress display mode
 )
 ```
 This configuration:
 - Implements intelligent rate limiting per domain
 - Monitors system resources
 - Provides detailed progress information
 - Manages concurrent crawls efficiently
 **Minimal Example**:
 ```python
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 crawl_conf = CrawlerRunConfig(
    js_code="document.querySelector('button#loadMore')?.click()",
    wait_for="css:.loaded-content",
    screenshot=True,
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),
        max_delay=60.0,
        max_retries=3,
        rate_limit_codes=[429, 503]
    ),
    stream=True  # Enable streaming
 )
 async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com", config=crawl_conf)
    print(result.screenshot[:100])  # Base64-encoded PNG snippet
 ```
 ---
 ## 3. Putting It All Together
@@ -322,13 +266,6 @@ async def main():
    run_conf = CrawlerRunConfig(
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
        enable_rate_limiting=True,
        rate_limit_config=RateLimitConfig(
            base_delay=(1.0, 3.0),
            max_delay=60.0,
            max_retries=3,
            rate_limit_codes=[429, 503]
        )
    )
    async with AsyncWebCrawler(config=browser_conf) as crawler:
--- a/docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb
+++ b/docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb
--- a/docs/releases_review/v0.3.74.overview.py
+++ b/docs/releases_review/v0.3.74.overview.py
--- a/docs/releases_review/v0_4_24_walkthrough.py
+++ b/docs/releases_review/v0_4_24_walkthrough.py
--- a/docs/releases_review/v0_4_3b2_features_demo.py
+++ b/docs/releases_review/v0_4_3b2_features_demo.py
@@ -31,9 +31,6 @@ import re
 import random
 from typing import Optional, Dict
 from dotenv import load_dotenv
 load_dotenv()
 from crawl4ai import (
    AsyncWebCrawler, 
    BrowserConfig,
@@ -48,6 +45,7 @@ from crawl4ai import (
    LLMContentFilter
 )
 load_dotenv()
 async def demo_memory_dispatcher():
    """Demonstrates the new memory-efficient dispatcher system.
@@ -283,7 +281,7 @@ async def demo_proxy_rotation():
    """
    print("\n=== 8. Proxy Rotation Demo ===")
-    async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
+    async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
        """Get next proxy from local file"""
        try:
            proxies = os.getenv("PROXIES", "").split(",")
@@ -323,7 +321,7 @@ async def demo_proxy_rotation():
                if verified:
                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                else:
-                    print(f"❌ Proxy failed or IP mismatch!")
+                    print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Failed with proxy {proxy['ip']}")