feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
 )
+from .proxy_strategy import (
+    ProxyRotationStrategy,
+    RoundRobinProxyStrategy,
+)
 from .extraction_strategy import (
    ExtractionStrategy,
    LLMExtractionStrategy,
@@ -60,31 +64,33 @@ __all__ = [
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
+    "ProxyRotationStrategy",
+    "RoundRobinProxyStrategy",
 ]


-def is_sync_version_installed():
-    try:
-        import selenium # noqa
+# def is_sync_version_installed():
+#     try:
+#         import selenium # noqa

-        return True
-    except ImportError:
-        return False
+#         return True
+#     except ImportError:
+#         return False


-if is_sync_version_installed():
-    try:
-        from .web_crawler import WebCrawler
+# if is_sync_version_installed():
+#     try:
+#         from .web_crawler import WebCrawler

-        __all__.append("WebCrawler")
-    except ImportError:
-        print(
-            "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
-        )
-else:
-    WebCrawler = None
-    # import warnings
-    # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+#         __all__.append("WebCrawler")
+#     except ImportError:
+#         print(
+#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+#         )
+# else:
+#     WebCrawler = None
+#     # import warnings
+#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")

 # Disable all Pydantic warnings
 warnings.filterwarnings("ignore", module="pydantic")
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
 from .cache_context import CacheMode
+from .proxy_strategy import ProxyRotationStrategy

 import inspect
 from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
        proxy_config: dict = None,
+        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
        # Caching Parameters
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.proxy_config = proxy_config
+        self.proxy_rotation_strategy = proxy_rotation_strategy

        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
            proxy_config=kwargs.get("proxy_config"),
+            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
            # Caching Parameters
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
            "parser_type": self.parser_type,
            "scraping_strategy": self.scraping_strategy,
            "proxy_config": self.proxy_config,
+            "proxy_rotation_strategy": self.proxy_rotation_strategy,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -394,6 +394,19 @@ class AsyncWebCrawler:
                        tag="FETCH",
                    )

+                # Update proxy configuration from rotation strategy if available
+                if config and config.proxy_rotation_strategy:
+                    next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
+                    if next_proxy:
+                        if verbose:
+                            self.logger.info(
+                                message="Switch proxy: {proxy}",
+                                tag="PROXY",
+                                params={"proxy": next_proxy.get("server")},
+                            )
+                        config.proxy_config = next_proxy
+                        # config = config.clone(proxy_config=next_proxy)
+
                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()
--- a/crawl4ai/proxy_strategy.py
+++ b/crawl4ai/proxy_strategy.py
@@ -0,0 +1,43 @@
+from typing import List, Dict, Optional
+from abc import ABC, abstractmethod
+from itertools import cycle
+
+class ProxyRotationStrategy(ABC):
+    """Interface that every proxy rotation strategy must implement.
+
+    Concrete strategies supply a synchronous ``add_proxies`` for registering
+    proxy configurations and an asynchronous ``get_next_proxy`` for handing
+    one out.
+    """
+
+    @abstractmethod
+    def add_proxies(self, proxies: List[Dict]):
+        """Register additional proxy configurations with the strategy."""
+
+    @abstractmethod
+    async def get_next_proxy(self) -> Optional[Dict]:
+        """Return the next proxy configuration chosen by the strategy.
+
+        The return type allows ``None`` for the case where no proxy is
+        available.
+        """
+
+class RoundRobinProxyStrategy(ProxyRotationStrategy):
+    """Simple round-robin proxy rotation strategy.
+
+    Proxies are handed out in the order they were added, wrapping back to the
+    first one after the last. Adding proxies while rotation is in progress
+    keeps the current position instead of restarting from the first proxy.
+    """
+
+    def __init__(self, proxies: Optional[List[Dict]] = None):
+        """
+        Initialize with optional list of proxy configurations
+
+        Args:
+            proxies: List of proxy config dictionaries, each containing at least
+                    'server' key with proxy URL
+        """
+        self._proxies: List[Dict] = []
+        # Index into self._proxies of the proxy returned by the next call.
+        self._next_index = 0
+        if proxies:
+            self.add_proxies(proxies)
+
+    def add_proxies(self, proxies: List[Dict]):
+        """Add new proxies to the end of the rotation pool.
+
+        Args:
+            proxies: Proxy config dictionaries to append. An empty list or
+                ``None`` leaves the pool unchanged.
+        """
+        if not proxies:
+            return
+        # The rotation index is deliberately left untouched so that proxies
+        # already in the pool keep their turn.
+        self._proxies.extend(proxies)
+
+    async def get_next_proxy(self) -> Optional[Dict]:
+        """Get next proxy in round-robin fashion.
+
+        Returns:
+            The next proxy config dictionary, or ``None`` when the pool is
+            empty.
+        """
+        # Check the pool itself: an iterator over an empty pool would raise
+        # StopIteration, which surfaces as RuntimeError inside a coroutine.
+        if not self._proxies:
+            return None
+        proxy = self._proxies[self._next_index]
+        self._next_index = (self._next_index + 1) % len(self._proxies)
+        return proxy