From 19df96ed565f934dd4c4226dd0cc61db039518cc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 9 Feb 2025 18:49:10 +0800 Subject: [PATCH] feat(proxy): add proxy rotation strategy Implements a new proxy rotation system with the following changes: - Add ProxyRotationStrategy abstract base class - Add RoundRobinProxyStrategy concrete implementation - Integrate proxy rotation with AsyncWebCrawler - Add proxy_rotation_strategy parameter to CrawlerRunConfig - Add example script demonstrating proxy rotation usage - Remove deprecated synchronous WebCrawler code - Clean up rate limiting documentation BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations --- crawl4ai/__init__.py | 42 +++-- crawl4ai/async_configs.py | 5 + crawl4ai/async_webcrawler.py | 13 ++ crawl4ai/proxy_strategy.py | 43 +++++ docs/examples/proxy_rotation_demo.py | 161 ++++++++++++++++++ docs/md_v2/api/async-webcrawler.md | 36 +--- docs/md_v2/api/parameters.md | 40 +---- docs/md_v2/core/browser-crawler-config.md | 71 +------- ...rawl4AI_v0.3.72_Release_Announcement.ipynb | 0 .../v0.3.74.overview.py | 0 .../v0_4_24_walkthrough.py | 0 .../v0_4_3b2_features_demo.py | 8 +- 12 files changed, 257 insertions(+), 162 deletions(-) create mode 100644 crawl4ai/proxy_strategy.py create mode 100644 docs/examples/proxy_rotation_demo.py rename docs/{notebooks => releases_review}/Crawl4AI_v0.3.72_Release_Announcement.ipynb (100%) rename docs/{examples => releases_review}/v0.3.74.overview.py (100%) rename docs/{examples => releases_review}/v0_4_24_walkthrough.py (100%) rename docs/{examples => releases_review}/v0_4_3b2_features_demo.py (98%) diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 8caa0be2..ea7c2191 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -8,6 +8,10 @@ from .content_scraping_strategy import ( WebScrapingStrategy, LXMLWebScrapingStrategy, ) +from .proxy_strategy import ( + ProxyRotationStrategy, + RoundRobinProxyStrategy, +) from .extraction_strategy import ( ExtractionStrategy, LLMExtractionStrategy, @@ -60,31 +64,33 @@ __all__ = [ "DisplayMode", "MarkdownGenerationResult", "Crawl4aiDockerClient", + "ProxyRotationStrategy", + "RoundRobinProxyStrategy", ] -def is_sync_version_installed(): - try: - import selenium # noqa +# def is_sync_version_installed(): +# try: +# import selenium # noqa - return True - except ImportError: - return False +# return True +# except ImportError: +# return False -if is_sync_version_installed(): - try: - from .web_crawler import WebCrawler +# if is_sync_version_installed(): +# try: +# from .web_crawler import WebCrawler - __all__.append("WebCrawler") - except ImportError: - print( - "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies." - ) -else: - WebCrawler = None - # import warnings - # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") +# __all__.append("WebCrawler") +# except ImportError: +# print( +# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies." +# ) +# else: +# WebCrawler = None +# # import warnings +# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. 
However, please note that the synchronous version will be deprecated soon.") # Disable all Pydantic warnings warnings.filterwarnings("ignore", module="pydantic") diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index dbf8f0c4..6c3b294d 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat from .deep_crawling import DeepCrawlStrategy from typing import Union, List from .cache_context import CacheMode +from .proxy_strategy import ProxyRotationStrategy import inspect from typing import Any, Dict, Optional @@ -542,6 +543,7 @@ class CrawlerRunConfig(): parser_type: str = "lxml", scraping_strategy: ContentScrapingStrategy = None, proxy_config: dict = None, + proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters @@ -620,6 +622,7 @@ class CrawlerRunConfig(): self.parser_type = parser_type self.scraping_strategy = scraping_strategy or WebScrapingStrategy() self.proxy_config = proxy_config + self.proxy_rotation_strategy = proxy_rotation_strategy # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -731,6 +734,7 @@ class CrawlerRunConfig(): parser_type=kwargs.get("parser_type", "lxml"), scraping_strategy=kwargs.get("scraping_strategy"), proxy_config=kwargs.get("proxy_config"), + proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters @@ -827,6 +831,7 @@ class CrawlerRunConfig(): "parser_type": self.parser_type, "scraping_strategy": self.scraping_strategy, "proxy_config": self.proxy_config, + "proxy_rotation_strategy": self.proxy_rotation_strategy, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index dfcd65c0..1c76dd79 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -394,6 +394,19 @@ class AsyncWebCrawler: tag="FETCH", ) + # Update proxy configuration from rotation strategy if available + if config and config.proxy_rotation_strategy: + next_proxy = await config.proxy_rotation_strategy.get_next_proxy() + if next_proxy: + if verbose: + self.logger.info( + message="Switch proxy: {proxy}", + tag="PROXY", + params={"proxy": next_proxy.get("server")}, + ) + config.proxy_config = next_proxy + # config = config.clone(proxy_config=next_proxy) + # Fetch fresh content if needed if not cached_result or not html: t1 = time.perf_counter() diff --git a/crawl4ai/proxy_strategy.py b/crawl4ai/proxy_strategy.py new file mode 100644 index 00000000..2db3cad6 --- /dev/null +++ b/crawl4ai/proxy_strategy.py @@ -0,0 +1,43 @@ +from typing import List, Dict, Optional +from abc import ABC, abstractmethod +from itertools import cycle + +class ProxyRotationStrategy(ABC): + """Base abstract class for proxy rotation strategies""" + + @abstractmethod + async def get_next_proxy(self) -> Optional[Dict]: + """Get next proxy configuration from the strategy""" + pass + + @abstractmethod + def add_proxies(self, proxies: List[Dict]): + """Add proxy configurations to the strategy""" + pass + +class RoundRobinProxyStrategy(ProxyRotationStrategy): + """Simple round-robin proxy rotation strategy""" + + def __init__(self, proxies: List[Dict] = None): + """ + Initialize with optional list of proxy configurations + + Args: + proxies: 
List of proxy config dictionaries, each containing at least + 'server' key with proxy URL + """ + self._proxies = [] + self._proxy_cycle = None + if proxies: + self.add_proxies(proxies) + + def add_proxies(self, proxies: List[Dict]): + """Add new proxies to the rotation pool""" + self._proxies.extend(proxies) + self._proxy_cycle = cycle(self._proxies) + + async def get_next_proxy(self) -> Optional[Dict]: + """Get next proxy in round-robin fashion""" + if not self._proxy_cycle: + return None + return next(self._proxy_cycle) diff --git a/docs/examples/proxy_rotation_demo.py b/docs/examples/proxy_rotation_demo.py new file mode 100644 index 00000000..7efa974d --- /dev/null +++ b/docs/examples/proxy_rotation_demo.py @@ -0,0 +1,161 @@ +import os +import re +from typing import List, Dict +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + RoundRobinProxyStrategy +) + +def load_proxies_from_env() -> List[Dict]: + """Load proxies from PROXIES environment variable""" + proxies = [] + try: + proxy_list = os.getenv("PROXIES", "").split(",") + for proxy in proxy_list: + if not proxy: + continue + ip, port, username, password = proxy.split(":") + proxies.append({ + "server": f"http://{ip}:{port}", + "username": username, + "password": password, + "ip": ip # Store original IP for verification + }) + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + +async def demo_proxy_rotation(): + """ + Proxy Rotation Demo using RoundRobinProxyStrategy + =============================================== + Demonstrates proxy rotation using the strategy pattern. + """ + print("\n=== Proxy Rotation Demo (Round Robin) ===") + + # Load proxies and create rotation strategy + proxies = load_proxies_from_env() + if not proxies: + print("No proxies found in environment. Set PROXIES env variable!") + return + + proxy_strategy = RoundRobinProxyStrategy(proxies) + + # Create configs + browser_config = BrowserConfig(headless=True, verbose=False) + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_rotation_strategy=proxy_strategy + ) + + # Test URLs + urls = ["https://httpbin.org/ip"] * len(proxies) # Test each proxy once + + async with AsyncWebCrawler(config=browser_config) as crawler: + for url in urls: + result = await crawler.arun(url=url, config=run_config) + + if result.success: + # Extract IP from response + ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) + current_proxy = run_config.proxy_config if run_config.proxy_config else None + + if current_proxy: + print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}") + verified = ip_match and ip_match.group(0) == current_proxy['ip'] + if verified: + print(f"βœ… Proxy working! IP matches: {current_proxy['ip']}") + else: + print("❌ Proxy failed or IP mismatch!") + else: + print(f"Request failed: {result.error_message}") + +async def demo_proxy_rotation_batch(): + """ + Proxy Rotation Demo with Batch Processing + ======================================= + Demonstrates proxy rotation using arun_many with memory dispatcher. + """ + print("\n=== Proxy Rotation Batch Demo ===") + + try: + # Load proxies and create rotation strategy + proxies = load_proxies_from_env() + if not proxies: + print("No proxies found in environment. 
Set PROXIES env variable!")
+            return
+
+        proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+        # Configurations
+        browser_config = BrowserConfig(headless=True, verbose=False)
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            proxy_rotation_strategy=proxy_strategy,
+            markdown_generator=DefaultMarkdownGenerator()
+        )
+
+        # Test URLs - multiple requests to test rotation
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
+
+        print("\nπŸ“ˆ Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            monitor = CrawlerMonitor(
+                max_visible_rows=10,
+                display_mode=DisplayMode.DETAILED
+            )
+
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=80.0,
+                check_interval=0.5,
+                max_session_permit=1,  # Run sequentially so each result maps to the proxy rotated in for it; use len(proxies) for full concurrency
+                # monitor=monitor  # Uncomment to attach the live progress monitor created above
+            )
+
+            print("\nπŸš€ Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config,
+                dispatcher=dispatcher
+            )
+
+            # Verify results
+            success_count = 0
+            for result in results:
+                if result.success:
+                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+                    if current_proxy and ip_match:
+                        print(f"URL {result.url}")
+                        print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
+                        verified = ip_match.group(0) == current_proxy['ip']
+                        if verified:
+                            print(f"βœ… Proxy working! IP matches: {current_proxy['ip']}")
+                            success_count += 1
+                        else:
+                            print("❌ Proxy failed or IP mismatch!")
+                    print("---")
+
+            print(f"\nβœ… Completed {len(results)} requests with {success_count} successful proxy verifications")
+
+    except Exception as e:
+        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
+
+if __name__ == "__main__":
+    import asyncio
+    from crawl4ai import (
+        CrawlerMonitor,
+        DisplayMode,
+        MemoryAdaptiveDispatcher,
+        DefaultMarkdownGenerator
+    )
+
+    async def run_demos():
+        # await demo_proxy_rotation()  # Original single-request demo
+        await demo_proxy_rotation_batch()  # New batch processing demo
+
+    asyncio.run(run_demos())
diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md
index eacd3de4..7f531ecd 100644
--- a/docs/md_v2/api/async-webcrawler.md
+++ b/docs/md_v2/api/async-webcrawler.md
@@ -160,41 +160,8 @@ The `arun_many()` method now uses an intelligent dispatcher that:

 ### 4.2 Example Usage

+Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
+
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
-
-# Configure browser
-browser_cfg = BrowserConfig(headless=True)
-
-# Configure crawler with rate limiting
-run_cfg = CrawlerRunConfig(
-    # Enable rate limiting
-    enable_rate_limiting=True,
-    rate_limit_config=RateLimitConfig(
-        base_delay=(1.0, 2.0), # Random delay between 1-2 seconds
-        max_delay=30.0, # Maximum delay after rate limit hits
-        max_retries=2, # Number of retries before giving up
-        rate_limit_codes=[429, 503] # Status codes that trigger rate limiting
-    ),
-    # Resource monitoring
-    memory_threshold_percent=70.0, # Pause if memory exceeds this
-    check_interval=0.5, # How often to check resources
-    max_session_permit=3, # Maximum concurrent crawls
-    display_mode=DisplayMode.DETAILED.value # Show detailed progress
-)
-
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-]
-
-async with AsyncWebCrawler(config=browser_cfg) as crawler:
-    results = await crawler.arun_many(urls, config=run_cfg)
-    for result in results:
-        print(f"URL: {result.url}, Success: {result.success}")
-```

 ### 4.3 Key Features

diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index e60c1025..0b994fd6 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -159,32 +159,7 @@ Use these for link-level content filtering (often to keep crawls β€œinternal”

 ---

-### G) **Rate Limiting & Resource Management**
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
-| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
-
-The `RateLimitConfig` class has these fields:
-
-| **Field** | **Type / Default** | **What It Does** |
-|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
-| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
-| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
-| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
-| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
-| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
-| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
-
----
-
-### H) **Debug & Logging**
+### G) **Debug & Logging**

 | **Parameter** | **Type / Default** | **What It Does** |
|----------------|--------------------|---------------------------------------------------------------------------| @@ -218,7 +193,7 @@ The `clone()` method is particularly useful when you need slightly different con ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode async def main(): # Configure the browser @@ -239,17 +214,6 @@ async def main(): exclude_external_links=True, wait_for="css:.article-loaded", screenshot=True, - enable_rate_limiting=True, - rate_limit_config=RateLimitConfig( - base_delay=(1.0, 3.0), - max_delay=60.0, - max_retries=3, - rate_limit_codes=[429, 503] - ), - memory_threshold_percent=70.0, - check_interval=1.0, - max_session_permit=20, - display_mode="DETAILED", stream=True ) diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 8d916738..f80bb04a 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -186,23 +186,19 @@ class CrawlerRunConfig: - If `True`, enables rate limiting for batch processing. - Requires `rate_limit_config` to be set. -10.β€€**`rate_limit_config`**: - - A `RateLimitConfig` object controlling rate limiting behavior. - - See below for details. - -11.β€€**`memory_threshold_percent`**: +10.β€€**`memory_threshold_percent`**: - The memory threshold (as a percentage) to monitor. - If exceeded, the crawler will pause or slow down. -12.β€€**`check_interval`**: +11.β€€**`check_interval`**: - The interval (in seconds) to check system resources. - Affects how often memory and CPU usage are monitored. -13.β€€**`max_session_permit`**: +12.β€€**`max_session_permit`**: - The maximum number of concurrent crawl sessions. - Helps prevent overwhelming the system. -14.β€€**`display_mode`**: +13.β€€**`display_mode`**: - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - Affects how much information is printed during the crawl. 
@@ -236,58 +232,6 @@ The `clone()` method: - Leaves the original configuration unchanged - Perfect for creating variations without repeating all parameters -### Rate Limiting & Resource Management - -For batch processing with `arun_many()`, you can enable intelligent rate limiting: - -```python -from crawl4ai import RateLimitConfig - -config = CrawlerRunConfig( - enable_rate_limiting=True, - rate_limit_config=RateLimitConfig( - base_delay=(1.0, 3.0), # Random delay range - max_delay=60.0, # Max delay after rate limits - max_retries=3, # Retries before giving up - rate_limit_codes=[429, 503] # Status codes to watch - ), - memory_threshold_percent=70.0, # Memory threshold - check_interval=1.0, # Resource check interval - max_session_permit=20, # Max concurrent crawls - display_mode="DETAILED" # Progress display mode -) -``` - -This configuration: -- Implements intelligent rate limiting per domain -- Monitors system resources -- Provides detailed progress information -- Manages concurrent crawls efficiently - -**Minimal Example**: - -```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -crawl_conf = CrawlerRunConfig( - js_code="document.querySelector('button#loadMore')?.click()", - wait_for="css:.loaded-content", - screenshot=True, - enable_rate_limiting=True, - rate_limit_config=RateLimitConfig( - base_delay=(1.0, 3.0), - max_delay=60.0, - max_retries=3, - rate_limit_codes=[429, 503] - ), - stream=True # Enable streaming -) - -async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com", config=crawl_conf) - print(result.screenshot[:100]) # Base64-encoded PNG snippet -``` - --- ## 3. Putting It All Together @@ -322,13 +266,6 @@ async def main(): run_conf = CrawlerRunConfig( extraction_strategy=extraction, cache_mode=CacheMode.BYPASS, - enable_rate_limiting=True, - rate_limit_config=RateLimitConfig( - base_delay=(1.0, 3.0), - max_delay=60.0, - max_retries=3, - rate_limit_codes=[429, 503] - ) ) async with AsyncWebCrawler(config=browser_conf) as crawler: diff --git a/docs/notebooks/Crawl4AI_v0.3.72_Release_Announcement.ipynb b/docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb similarity index 100% rename from docs/notebooks/Crawl4AI_v0.3.72_Release_Announcement.ipynb rename to docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb diff --git a/docs/examples/v0.3.74.overview.py b/docs/releases_review/v0.3.74.overview.py similarity index 100% rename from docs/examples/v0.3.74.overview.py rename to docs/releases_review/v0.3.74.overview.py diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/releases_review/v0_4_24_walkthrough.py similarity index 100% rename from docs/examples/v0_4_24_walkthrough.py rename to docs/releases_review/v0_4_24_walkthrough.py diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/releases_review/v0_4_3b2_features_demo.py similarity index 98% rename from docs/examples/v0_4_3b2_features_demo.py rename to docs/releases_review/v0_4_3b2_features_demo.py index 1032f346..37862784 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/releases_review/v0_4_3b2_features_demo.py @@ -31,9 +31,6 @@ import re import random from typing import Optional, Dict from dotenv import load_dotenv - -load_dotenv() - from crawl4ai import ( AsyncWebCrawler, BrowserConfig, @@ -48,6 +45,7 @@ from crawl4ai import ( LLMContentFilter ) +load_dotenv() async def demo_memory_dispatcher(): """Demonstrates the new memory-efficient dispatcher system. 
@@ -283,7 +281,7 @@ async def demo_proxy_rotation(): """ print("\n=== 8. Proxy Rotation Demo ===") - async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: + async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]: """Get next proxy from local file""" try: proxies = os.getenv("PROXIES", "").split(",") @@ -323,7 +321,7 @@ async def demo_proxy_rotation(): if verified: print(f"βœ… Proxy working! IP matches: {proxy['ip']}") else: - print(f"❌ Proxy failed or IP mismatch!") + print("❌ Proxy failed or IP mismatch!") else: print(f"Failed with proxy {proxy['ip']}")
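
---

For quick reference, a minimal sketch of how the new `proxy_rotation_strategy` option is wired up, mirroring the `docs/examples/proxy_rotation_demo.py` script added by this patch. The proxy entries and target URL below are placeholders, not values from the patch:

```python
import asyncio

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    RoundRobinProxyStrategy,
)

# Placeholder proxies; each dict needs at least a "server" key.
PROXIES = [
    {"server": "http://111.111.111.111:8080", "username": "user", "password": "pass"},
    {"server": "http://222.222.222.222:8080", "username": "user", "password": "pass"},
]

async def main():
    strategy = RoundRobinProxyStrategy(PROXIES)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=strategy,  # new parameter introduced by this patch
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        for _ in PROXIES:
            # Before each fetch the crawler asks the strategy for the next proxy
            # and assigns it to run_config.proxy_config.
            result = await crawler.arun(url="https://httpbin.org/ip", config=run_config)
            if run_config.proxy_config:
                print(result.success, run_config.proxy_config["server"])

if __name__ == "__main__":
    asyncio.run(main())
```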
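Because rotation is exposed through the `ProxyRotationStrategy` ABC, other policies can be plugged in without touching the crawler. A hypothetical random-choice strategy, sketched against the abstract interface added in `crawl4ai/proxy_strategy.py` (the class name and behavior are illustrative, not part of the patch):

```python
import random
from typing import Dict, List, Optional

from crawl4ai import ProxyRotationStrategy

class RandomProxyStrategy(ProxyRotationStrategy):
    """Hypothetical strategy: pick a proxy at random for every request."""

    def __init__(self, proxies: List[Dict] = None):
        self._proxies: List[Dict] = []
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]):
        # Extend the pool; no cycle state is needed for random selection.
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        return random.choice(self._proxies) if self._proxies else None
```

Any object implementing these two methods can be passed as `proxy_rotation_strategy`; the crawler only calls `get_next_proxy()` before each fetch.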