> **Changelog:**
>
> - Updated `ProxyConfig.from_string` to support multiple proxy formats, including URLs with credentials.
> - Deprecated the `proxy` parameter in `BrowserConfig`, replacing it with `proxy_config` for better flexibility. A warning is emitted for deprecated usage, and behavior is clarified when both parameters are provided.
> - Updated documentation and tests to reflect changes in proxy configuration handling.

# Proxy

## Basic Proxy Setup

Simple proxy configuration with `BrowserConfig`:

```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig

# Using HTTP proxy
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com")

# Using SOCKS proxy
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Authenticated Proxy

Use an authenticated proxy with `BrowserConfig`:

```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig

browser_config = BrowserConfig(proxy_config={
    "server": "http://[host]:[port]",
    "username": "[username]",
    "password": "[password]",
})
async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(url="https://example.com")
```

## Rotating Proxies

Example using a proxy rotation service dynamically:

```python
import asyncio
import re

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    ProxyConfig,
    RoundRobinProxyStrategy,
)

async def main():
    # Load proxies and create rotation strategy
    # e.g.: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
    proxies = ProxyConfig.from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    # Create configs
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=proxy_strategy,
    )

    print("\n📈 Initializing crawler with proxy rotation...")
    async with AsyncWebCrawler(config=browser_config) as crawler:
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n🚀 Starting batch crawl with proxy rotation...")
        results = await crawler.arun_many(
            urls=urls,
            config=run_config,
        )
        for result in results:
            if result.success:
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                current_proxy = run_config.proxy_config if run_config.proxy_config else None

                if current_proxy and ip_match:
                    print(f"URL {result.url}")
                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
                    verified = ip_match.group(0) == current_proxy.ip
                    if verified:
                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
                    else:
                        print("❌ Proxy failed or IP mismatch!")
                print("---")

asyncio.run(main())
```