feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:
- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
This commit is contained in:
UncleCode
2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions

View File

@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
WebScrapingStrategy,
LXMLWebScrapingStrategy,
)
from .proxy_strategy import (
ProxyRotationStrategy,
RoundRobinProxyStrategy,
)
from .extraction_strategy import (
ExtractionStrategy,
LLMExtractionStrategy,
@@ -60,31 +64,33 @@ __all__ = [
"DisplayMode",
"MarkdownGenerationResult",
"Crawl4aiDockerClient",
"ProxyRotationStrategy",
"RoundRobinProxyStrategy",
]
def is_sync_version_installed():
try:
import selenium # noqa
# def is_sync_version_installed():
# try:
# import selenium # noqa
return True
except ImportError:
return False
# return True
# except ImportError:
# return False
if is_sync_version_installed():
try:
from .web_crawler import WebCrawler
# if is_sync_version_installed():
# try:
# from .web_crawler import WebCrawler
__all__.append("WebCrawler")
except ImportError:
print(
"Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
)
else:
WebCrawler = None
# import warnings
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
# __all__.append("WebCrawler")
# except ImportError:
# print(
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
# )
# else:
# WebCrawler = None
# # import warnings
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
# Disable all Pydantic warnings
warnings.filterwarnings("ignore", module="pydantic")

View File

@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
from .deep_crawling import DeepCrawlStrategy
from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
proxy_config: dict = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.proxy_config = proxy_config
self.proxy_rotation_strategy = proxy_rotation_strategy
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
parser_type=kwargs.get("parser_type", "lxml"),
scraping_strategy=kwargs.get("scraping_strategy"),
proxy_config=kwargs.get("proxy_config"),
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
"parser_type": self.parser_type,
"scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config,
"proxy_rotation_strategy": self.proxy_rotation_strategy,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,

View File

@@ -394,6 +394,19 @@ class AsyncWebCrawler:
tag="FETCH",
)
# Update proxy configuration from rotation strategy if available
if config and config.proxy_rotation_strategy:
next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
if next_proxy:
if verbose:
self.logger.info(
message="Switch proxy: {proxy}",
tag="PROXY",
params={"proxy": next_proxy.get("server")},
)
config.proxy_config = next_proxy
# config = config.clone(proxy_config=next_proxy)
# Fetch fresh content if needed
if not cached_result or not html:
t1 = time.perf_counter()

View File

@@ -0,0 +1,43 @@
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle
class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies"""
@abstractmethod
async def get_next_proxy(self) -> Optional[Dict]:
"""Get next proxy configuration from the strategy"""
pass
@abstractmethod
def add_proxies(self, proxies: List[Dict]):
"""Add proxy configurations to the strategy"""
pass
class RoundRobinProxyStrategy(ProxyRotationStrategy):
"""Simple round-robin proxy rotation strategy"""
def __init__(self, proxies: List[Dict] = None):
"""
Initialize with optional list of proxy configurations
Args:
proxies: List of proxy config dictionaries, each containing at least
'server' key with proxy URL
"""
self._proxies = []
self._proxy_cycle = None
if proxies:
self.add_proxies(proxies)
def add_proxies(self, proxies: List[Dict]):
"""Add new proxies to the rotation pool"""
self._proxies.extend(proxies)
self._proxy_cycle = cycle(self._proxies)
async def get_next_proxy(self) -> Optional[Dict]:
"""Get next proxy in round-robin fashion"""
if not self._proxy_cycle:
return None
return next(self._proxy_cycle)