feat(proxy): add proxy rotation strategy
Implements a new proxy rotation system with the following changes: - Add ProxyRotationStrategy abstract base class - Add RoundRobinProxyStrategy concrete implementation - Integrate proxy rotation with AsyncWebCrawler - Add proxy_rotation_strategy parameter to CrawlerRunConfig - Add example script demonstrating proxy rotation usage - Remove deprecated synchronous WebCrawler code - Clean up rate limiting documentation BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
This commit is contained in:
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
|
|||||||
WebScrapingStrategy,
|
WebScrapingStrategy,
|
||||||
LXMLWebScrapingStrategy,
|
LXMLWebScrapingStrategy,
|
||||||
)
|
)
|
||||||
|
from .proxy_strategy import (
|
||||||
|
ProxyRotationStrategy,
|
||||||
|
RoundRobinProxyStrategy,
|
||||||
|
)
|
||||||
from .extraction_strategy import (
|
from .extraction_strategy import (
|
||||||
ExtractionStrategy,
|
ExtractionStrategy,
|
||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
@@ -60,31 +64,33 @@ __all__ = [
|
|||||||
"DisplayMode",
|
"DisplayMode",
|
||||||
"MarkdownGenerationResult",
|
"MarkdownGenerationResult",
|
||||||
"Crawl4aiDockerClient",
|
"Crawl4aiDockerClient",
|
||||||
|
"ProxyRotationStrategy",
|
||||||
|
"RoundRobinProxyStrategy",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def is_sync_version_installed():
|
# def is_sync_version_installed():
|
||||||
try:
|
# try:
|
||||||
import selenium # noqa
|
# import selenium # noqa
|
||||||
|
|
||||||
return True
|
# return True
|
||||||
except ImportError:
|
# except ImportError:
|
||||||
return False
|
# return False
|
||||||
|
|
||||||
|
|
||||||
if is_sync_version_installed():
|
# if is_sync_version_installed():
|
||||||
try:
|
# try:
|
||||||
from .web_crawler import WebCrawler
|
# from .web_crawler import WebCrawler
|
||||||
|
|
||||||
__all__.append("WebCrawler")
|
# __all__.append("WebCrawler")
|
||||||
except ImportError:
|
# except ImportError:
|
||||||
print(
|
# print(
|
||||||
"Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
|
# "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
|
||||||
)
|
# )
|
||||||
else:
|
# else:
|
||||||
WebCrawler = None
|
# WebCrawler = None
|
||||||
# import warnings
|
# # import warnings
|
||||||
# print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
# # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
|
||||||
|
|
||||||
# Disable all Pydantic warnings
|
# Disable all Pydantic warnings
|
||||||
warnings.filterwarnings("ignore", module="pydantic")
|
warnings.filterwarnings("ignore", module="pydantic")
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrat
|
|||||||
from .deep_crawling import DeepCrawlStrategy
|
from .deep_crawling import DeepCrawlStrategy
|
||||||
from typing import Union, List
|
from typing import Union, List
|
||||||
from .cache_context import CacheMode
|
from .cache_context import CacheMode
|
||||||
|
from .proxy_strategy import ProxyRotationStrategy
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
@@ -542,6 +543,7 @@ class CrawlerRunConfig():
|
|||||||
parser_type: str = "lxml",
|
parser_type: str = "lxml",
|
||||||
scraping_strategy: ContentScrapingStrategy = None,
|
scraping_strategy: ContentScrapingStrategy = None,
|
||||||
proxy_config: dict = None,
|
proxy_config: dict = None,
|
||||||
|
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -620,6 +622,7 @@ class CrawlerRunConfig():
|
|||||||
self.parser_type = parser_type
|
self.parser_type = parser_type
|
||||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
|
self.proxy_rotation_strategy = proxy_rotation_strategy
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||||
@@ -731,6 +734,7 @@ class CrawlerRunConfig():
|
|||||||
parser_type=kwargs.get("parser_type", "lxml"),
|
parser_type=kwargs.get("parser_type", "lxml"),
|
||||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||||
proxy_config=kwargs.get("proxy_config"),
|
proxy_config=kwargs.get("proxy_config"),
|
||||||
|
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -827,6 +831,7 @@ class CrawlerRunConfig():
|
|||||||
"parser_type": self.parser_type,
|
"parser_type": self.parser_type,
|
||||||
"scraping_strategy": self.scraping_strategy,
|
"scraping_strategy": self.scraping_strategy,
|
||||||
"proxy_config": self.proxy_config,
|
"proxy_config": self.proxy_config,
|
||||||
|
"proxy_rotation_strategy": self.proxy_rotation_strategy,
|
||||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||||
"cache_mode": self.cache_mode,
|
"cache_mode": self.cache_mode,
|
||||||
"session_id": self.session_id,
|
"session_id": self.session_id,
|
||||||
|
|||||||
@@ -394,6 +394,19 @@ class AsyncWebCrawler:
|
|||||||
tag="FETCH",
|
tag="FETCH",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Update proxy configuration from rotation strategy if available
|
||||||
|
if config and config.proxy_rotation_strategy:
|
||||||
|
next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
|
||||||
|
if next_proxy:
|
||||||
|
if verbose:
|
||||||
|
self.logger.info(
|
||||||
|
message="Switch proxy: {proxy}",
|
||||||
|
tag="PROXY",
|
||||||
|
params={"proxy": next_proxy.get("server")},
|
||||||
|
)
|
||||||
|
config.proxy_config = next_proxy
|
||||||
|
# config = config.clone(proxy_config=next_proxy)
|
||||||
|
|
||||||
# Fetch fresh content if needed
|
# Fetch fresh content if needed
|
||||||
if not cached_result or not html:
|
if not cached_result or not html:
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
|
|||||||
43
crawl4ai/proxy_strategy.py
Normal file
43
crawl4ai/proxy_strategy.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
from typing import List, Dict, Optional
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from itertools import cycle
|
||||||
|
|
||||||
|
class ProxyRotationStrategy(ABC):
    """Abstract interface for objects that hand out proxy configurations.

    Implementations decide the order in which proxies are returned; callers
    only rely on ``get_next_proxy`` and ``add_proxies``.
    """

    @abstractmethod
    async def get_next_proxy(self) -> Optional[Dict]:
        """Return the next proxy configuration, or ``None`` if none is available."""

    @abstractmethod
    def add_proxies(self, proxies: List[Dict]) -> None:
        """Add proxy configurations to the strategy's pool."""


class RoundRobinProxyStrategy(ProxyRotationStrategy):
    """Simple round-robin proxy rotation strategy.

    Proxies are returned in the order they were added, wrapping around to the
    first one after the last has been handed out.
    """

    def __init__(self, proxies: Optional[List[Dict]] = None):
        """Initialize with an optional list of proxy configurations.

        Args:
            proxies: List of proxy config dictionaries, each containing at
                least a 'server' key with the proxy URL.
        """
        self._proxies: List[Dict] = []
        # Position of the proxy that the next get_next_proxy() call returns.
        self._next_index = 0
        if proxies:
            self.add_proxies(proxies)

    def add_proxies(self, proxies: List[Dict]) -> None:
        """Append new proxies to the rotation pool.

        The current rotation position is kept, so adding proxies mid-rotation
        does not restart from the first proxy.
        """
        self._proxies.extend(proxies)

    async def get_next_proxy(self) -> Optional[Dict]:
        """Return the next proxy in round-robin order.

        Returns:
            The next proxy config dictionary, or ``None`` when the pool is
            empty (including after ``add_proxies([])``).
        """
        # Checking the list itself (not an iterator object) makes the empty
        # pool case reliable: an empty itertools.cycle is truthy and raises
        # StopIteration, which a coroutine turns into RuntimeError.
        if not self._proxies:
            return None
        proxy = self._proxies[self._next_index % len(self._proxies)]
        self._next_index = (self._next_index + 1) % len(self._proxies)
        return proxy
|
||||||
161
docs/examples/proxy_rotation_demo.py
Normal file
161
docs/examples/proxy_rotation_demo.py
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
from typing import List, Dict
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
RoundRobinProxyStrategy
|
||||||
|
)
|
||||||
|
|
||||||
|
def load_proxies_from_env() -> List[Dict]:
    """Load proxy configurations from the ``PROXIES`` environment variable.

    ``PROXIES`` is a comma-separated list of ``ip:port:username:password``
    entries. Blank entries and surrounding whitespace are ignored. A
    malformed entry is reported and skipped without discarding the valid
    entries that follow it.

    Returns:
        A list of dictionaries with the keys ``server``, ``username``,
        ``password`` and ``ip``; empty when the variable is unset or holds no
        valid entries.
    """
    proxies = []
    raw_entries = os.getenv("PROXIES", "").split(",")
    for position, raw_entry in enumerate(raw_entries, start=1):
        entry = raw_entry.strip()
        if not entry:
            continue
        # maxsplit=3 keeps any ':' characters inside the password intact.
        fields = entry.split(":", 3)
        if len(fields) != 4:
            # Report by position only: the entry itself may hold credentials.
            print(
                "Error loading proxies from environment: "
                f"entry #{position} is not in ip:port:username:password format, skipping"
            )
            continue
        ip, port, username, password = fields
        proxies.append({
            "server": f"http://{ip}:{port}",
            "username": username,
            "password": password,
            "ip": ip,  # Store original IP for verification
        })
    return proxies
|
||||||
|
|
||||||
|
async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    ===============================================
    Crawls an IP-echo endpoint once per configured proxy and compares the
    IP found in the response with the IP of the proxy set on the run config.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    # Nothing to demonstrate without at least one proxy.
    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=RoundRobinProxyStrategy(proxies),
    )
    browser_config = BrowserConfig(headless=True, verbose=False)

    # One request per proxy, so each proxy is exercised once.
    targets = ["https://httpbin.org/ip"] * len(proxies)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for target in targets:
            result = await crawler.arun(url=target, config=run_config)

            if not result.success:
                print(f"Request failed: {result.error_message}")
                continue

            # Extract IP from response
            ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
            # Read back whatever proxy is now set on the shared run config.
            current_proxy = run_config.proxy_config or None
            if not current_proxy:
                continue

            response_ip = ip_match.group(0) if ip_match else None
            print(f"Proxy {current_proxy['server']} -> Response IP: {response_ip if ip_match else 'Not found'}")

            if response_ip is not None and response_ip == current_proxy['ip']:
                print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
            else:
                print("❌ Proxy failed or IP mismatch!")
|
||||||
|
|
||||||
|
async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =======================================
    Demonstrates proxy rotation using arun_many with memory dispatcher.

    Each response from the IP-echo endpoint is matched against the IPs of the
    loaded proxies. Matching per response is needed because the run config is
    shared by the whole batch and therefore cannot say which proxy served a
    particular request.
    """
    print("\n=== Proxy Rotation Batch Demo ===")

    try:
        # Imported locally so the demo also works when this module is
        # imported instead of executed as a script.
        from crawl4ai import DefaultMarkdownGenerator, MemoryAdaptiveDispatcher

        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)
        # Lookup used to attribute each response to the proxy that served it.
        proxies_by_ip = {proxy["ip"]: proxy for proxy in proxies}

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator(),
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # one crawl at a time
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher,
            )

            # Verify results
            success_count = 0
            for result in results:
                if not result.success:
                    continue
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                if not ip_match:
                    continue

                response_ip = ip_match.group(0)
                used_proxy = proxies_by_ip.get(response_ip)

                print(f"URL {result.url}")
                if used_proxy:
                    print(f"Proxy {used_proxy['server']} -> Response IP: {response_ip}")
                    print(f"✅ Proxy working! IP matches: {used_proxy['ip']}")
                    success_count += 1
                else:
                    print(f"Response IP: {response_ip}")
                    print("❌ Proxy failed or IP mismatch!")
                print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        # Demo-level boundary: report the failure instead of crashing the script.
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: pull in the extra names used by the batch demo,
    # then run the selected demo(s) on a fresh event loop.
    import asyncio
    from crawl4ai import (
        CrawlerMonitor,
        DisplayMode,
        MemoryAdaptiveDispatcher,
        DefaultMarkdownGenerator
    )

    async def run_demos():
        """Run the enabled demos in sequence."""
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo

    asyncio.run(run_demos())
|
||||||
@@ -160,41 +160,9 @@ The `arun_many()` method now uses an intelligent dispatcher that:
|
|||||||
|
|
||||||
### 4.2 Example Usage
|
### 4.2 Example Usage
|
||||||
|
|
||||||
|
Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
|
||||||
from crawl4ai.dispatcher import DisplayMode
|
|
||||||
|
|
||||||
# Configure browser
|
|
||||||
browser_cfg = BrowserConfig(headless=True)
|
|
||||||
|
|
||||||
# Configure crawler with rate limiting
|
|
||||||
run_cfg = CrawlerRunConfig(
|
|
||||||
# Enable rate limiting
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds
|
|
||||||
max_delay=30.0, # Maximum delay after rate limit hits
|
|
||||||
max_retries=2, # Number of retries before giving up
|
|
||||||
rate_limit_codes=[429, 503] # Status codes that trigger rate limiting
|
|
||||||
),
|
|
||||||
# Resource monitoring
|
|
||||||
memory_threshold_percent=70.0, # Pause if memory exceeds this
|
|
||||||
check_interval=0.5, # How often to check resources
|
|
||||||
max_session_permit=3, # Maximum concurrent crawls
|
|
||||||
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
|
||||||
)
|
|
||||||
|
|
||||||
urls = [
|
|
||||||
"https://example.com/page1",
|
|
||||||
"https://example.com/page2",
|
|
||||||
"https://example.com/page3"
|
|
||||||
]
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
|
||||||
results = await crawler.arun_many(urls, config=run_cfg)
|
|
||||||
for result in results:
|
|
||||||
print(f"URL: {result.url}, Success: {result.success}")
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4.3 Key Features
|
### 4.3 Key Features
|
||||||
|
|
||||||
|
|||||||
@@ -159,32 +159,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### G) **Rate Limiting & Resource Management**
|
### G) **Debug & Logging**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
|
||||||
|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
|
||||||
| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
|
|
||||||
| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
|
|
||||||
|
|
||||||
The `RateLimitConfig` class has these fields:
|
|
||||||
|
|
||||||
| **Field** | **Type / Default** | **What It Does** |
|
|
||||||
|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
|
||||||
| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
|
|
||||||
| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
|
|
||||||
| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
|
|
||||||
| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
|
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
|
||||||
|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
|
||||||
| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
|
|
||||||
| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
|
|
||||||
| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
|
|
||||||
| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### H) **Debug & Logging**
|
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------|--------------------|---------------------------------------------------------------------------|
|
|----------------|--------------------|---------------------------------------------------------------------------|
|
||||||
@@ -218,7 +193,7 @@ The `clone()` method is particularly useful when you need slightly different con
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Configure the browser
|
# Configure the browser
|
||||||
@@ -239,17 +214,6 @@ async def main():
|
|||||||
exclude_external_links=True,
|
exclude_external_links=True,
|
||||||
wait_for="css:.article-loaded",
|
wait_for="css:.article-loaded",
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 3.0),
|
|
||||||
max_delay=60.0,
|
|
||||||
max_retries=3,
|
|
||||||
rate_limit_codes=[429, 503]
|
|
||||||
),
|
|
||||||
memory_threshold_percent=70.0,
|
|
||||||
check_interval=1.0,
|
|
||||||
max_session_permit=20,
|
|
||||||
display_mode="DETAILED",
|
|
||||||
stream=True
|
stream=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -186,23 +186,19 @@ class CrawlerRunConfig:
|
|||||||
- If `True`, enables rate limiting for batch processing.
|
- If `True`, enables rate limiting for batch processing.
|
||||||
- Requires `rate_limit_config` to be set.
|
- Requires `rate_limit_config` to be set.
|
||||||
|
|
||||||
10. **`rate_limit_config`**:
|
10. **`memory_threshold_percent`**:
|
||||||
- A `RateLimitConfig` object controlling rate limiting behavior.
|
|
||||||
- See below for details.
|
|
||||||
|
|
||||||
11. **`memory_threshold_percent`**:
|
|
||||||
- The memory threshold (as a percentage) to monitor.
|
- The memory threshold (as a percentage) to monitor.
|
||||||
- If exceeded, the crawler will pause or slow down.
|
- If exceeded, the crawler will pause or slow down.
|
||||||
|
|
||||||
12. **`check_interval`**:
|
11. **`check_interval`**:
|
||||||
- The interval (in seconds) to check system resources.
|
- The interval (in seconds) to check system resources.
|
||||||
- Affects how often memory and CPU usage are monitored.
|
- Affects how often memory and CPU usage are monitored.
|
||||||
|
|
||||||
13. **`max_session_permit`**:
|
12. **`max_session_permit`**:
|
||||||
- The maximum number of concurrent crawl sessions.
|
- The maximum number of concurrent crawl sessions.
|
||||||
- Helps prevent overwhelming the system.
|
- Helps prevent overwhelming the system.
|
||||||
|
|
||||||
14. **`display_mode`**:
|
13. **`display_mode`**:
|
||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||||
- Affects how much information is printed during the crawl.
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
@@ -236,58 +232,6 @@ The `clone()` method:
|
|||||||
- Leaves the original configuration unchanged
|
- Leaves the original configuration unchanged
|
||||||
- Perfect for creating variations without repeating all parameters
|
- Perfect for creating variations without repeating all parameters
|
||||||
|
|
||||||
### Rate Limiting & Resource Management
|
|
||||||
|
|
||||||
For batch processing with `arun_many()`, you can enable intelligent rate limiting:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import RateLimitConfig
|
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 3.0), # Random delay range
|
|
||||||
max_delay=60.0, # Max delay after rate limits
|
|
||||||
max_retries=3, # Retries before giving up
|
|
||||||
rate_limit_codes=[429, 503] # Status codes to watch
|
|
||||||
),
|
|
||||||
memory_threshold_percent=70.0, # Memory threshold
|
|
||||||
check_interval=1.0, # Resource check interval
|
|
||||||
max_session_permit=20, # Max concurrent crawls
|
|
||||||
display_mode="DETAILED" # Progress display mode
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
This configuration:
|
|
||||||
- Implements intelligent rate limiting per domain
|
|
||||||
- Monitors system resources
|
|
||||||
- Provides detailed progress information
|
|
||||||
- Manages concurrent crawls efficiently
|
|
||||||
|
|
||||||
**Minimal Example**:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
||||||
|
|
||||||
crawl_conf = CrawlerRunConfig(
|
|
||||||
js_code="document.querySelector('button#loadMore')?.click()",
|
|
||||||
wait_for="css:.loaded-content",
|
|
||||||
screenshot=True,
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 3.0),
|
|
||||||
max_delay=60.0,
|
|
||||||
max_retries=3,
|
|
||||||
rate_limit_codes=[429, 503]
|
|
||||||
),
|
|
||||||
stream=True # Enable streaming
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(url="https://example.com", config=crawl_conf)
|
|
||||||
print(result.screenshot[:100]) # Base64-encoded PNG snippet
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 3. Putting It All Together
|
## 3. Putting It All Together
|
||||||
@@ -322,13 +266,6 @@ async def main():
|
|||||||
run_conf = CrawlerRunConfig(
|
run_conf = CrawlerRunConfig(
|
||||||
extraction_strategy=extraction,
|
extraction_strategy=extraction,
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 3.0),
|
|
||||||
max_delay=60.0,
|
|
||||||
max_retries=3,
|
|
||||||
rate_limit_codes=[429, 503]
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
||||||
|
|||||||
@@ -31,9 +31,6 @@ import re
|
|||||||
import random
|
import random
|
||||||
from typing import Optional, Dict
|
from typing import Optional, Dict
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
from crawl4ai import (
|
from crawl4ai import (
|
||||||
AsyncWebCrawler,
|
AsyncWebCrawler,
|
||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
@@ -48,6 +45,7 @@ from crawl4ai import (
|
|||||||
LLMContentFilter
|
LLMContentFilter
|
||||||
)
|
)
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
async def demo_memory_dispatcher():
|
async def demo_memory_dispatcher():
|
||||||
"""Demonstrates the new memory-efficient dispatcher system.
|
"""Demonstrates the new memory-efficient dispatcher system.
|
||||||
@@ -283,7 +281,7 @@ async def demo_proxy_rotation():
|
|||||||
"""
|
"""
|
||||||
print("\n=== 8. Proxy Rotation Demo ===")
|
print("\n=== 8. Proxy Rotation Demo ===")
|
||||||
|
|
||||||
async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
|
async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
|
||||||
"""Get next proxy from local file"""
|
"""Get next proxy from local file"""
|
||||||
try:
|
try:
|
||||||
proxies = os.getenv("PROXIES", "").split(",")
|
proxies = os.getenv("PROXIES", "").split(",")
|
||||||
@@ -323,7 +321,7 @@ async def demo_proxy_rotation():
|
|||||||
if verified:
|
if verified:
|
||||||
print(f"✅ Proxy working! IP matches: {proxy['ip']}")
|
print(f"✅ Proxy working! IP matches: {proxy['ip']}")
|
||||||
else:
|
else:
|
||||||
print(f"❌ Proxy failed or IP mismatch!")
|
print("❌ Proxy failed or IP mismatch!")
|
||||||
else:
|
else:
|
||||||
print(f"Failed with proxy {proxy['ip']}")
|
print(f"Failed with proxy {proxy['ip']}")
|
||||||
|
|
||||||
Reference in New Issue
Block a user