From 263ac890fd2be83cd4b343fc7b594f453fef9f36 Mon Sep 17 00:00:00 2001 From: AHMET YILMAZ Date: Mon, 10 Nov 2025 11:42:07 +0800 Subject: [PATCH 1/2] #1591 : Enhance proxy configuration documentation with security features, SSL analysis, and improved examples --- docs/md_v2/advanced/proxy-security.md | 359 +++++++++++++++++++++----- 1 file changed, 297 insertions(+), 62 deletions(-) diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index d1c868b2..accfb25a 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -1,98 +1,333 @@ -# Proxy +# Proxy & Security + +This guide covers proxy configuration and security features in Crawl4AI, including SSL certificate analysis and proxy rotation strategies. + +## Understanding Proxy Configuration + +Crawl4AI supports proxy configuration at two levels: + +### BrowserConfig.proxy_config +Sets proxy at the **browser level** - affects all pages/tabs in that browser instance. Use this when: +- You want all crawls from this browser to use the same proxy +- You're using a single proxy for the entire session +- You need persistent proxy settings across multiple crawls + +### CrawlerRunConfig.proxy_config +Sets proxy at the **request level** - can be different for each crawl operation. 
Use this when: +- You want per-request proxy control +- You're implementing proxy rotation +- Different URLs need different proxies ## Basic Proxy Setup -Simple proxy configuration with `BrowserConfig`: +### Browser-Level Proxy (BrowserConfig) + +Configure proxies that apply to the entire browser session: ```python -from crawl4ai.async_configs import BrowserConfig - -# Using HTTP proxy -browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"}) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") - -# Using SOCKS proxy -browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"}) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Authenticated Proxy - -Use an authenticated proxy with `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig +# Using dictionary configuration browser_config = BrowserConfig(proxy_config={ - "server": "http://[host]:[port]", - "username": "[username]", - "password": "[password]", + "server": "http://proxy.example.com:8080" }) + +# Using ProxyConfig object +from crawl4ai import ProxyConfig +proxy = ProxyConfig(server="http://proxy.example.com:8080") +browser_config = BrowserConfig(proxy_config=proxy) + +# Using string (auto-parsed) +browser_config = BrowserConfig(proxy_config="http://proxy.example.com:8080") + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` +### Request-Level Proxy (CrawlerRunConfig) -## Rotating Proxies - -Example using a proxy rotation service dynamically: +Configure proxies that can be customized per crawl operation: ```python -import re -from crawl4ai import ( - AsyncWebCrawler, - BrowserConfig, - CrawlerRunConfig, - CacheMode, - RoundRobinProxyStrategy, -) -import 
asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +# Using dictionary configuration +run_config = CrawlerRunConfig(proxy_config={ + "server": "http://proxy.example.com:8080" +}) + +# Using ProxyConfig object from crawl4ai import ProxyConfig +proxy = ProxyConfig(server="http://proxy.example.com:8080") +run_config = CrawlerRunConfig(proxy_config=proxy) + +# Using string (auto-parsed) +run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080") + +browser_config = BrowserConfig() +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) +``` + +!!! note "Priority Order" + When both `BrowserConfig.proxy_config` and `CrawlerRunConfig.proxy_config` are set, `CrawlerRunConfig.proxy_config` takes precedence for that specific crawl operation. + +## Supported Proxy Formats + +The `ProxyConfig.from_string()` method supports multiple formats: + +```python +from crawl4ai import ProxyConfig + +# HTTP proxy with authentication +proxy1 = ProxyConfig.from_string("http://user:pass@192.168.1.1:8080") + +# HTTPS proxy +proxy2 = ProxyConfig.from_string("https://proxy.example.com:8080") + +# SOCKS5 proxy +proxy3 = ProxyConfig.from_string("socks5://proxy.example.com:1080") + +# Simple IP:port format +proxy4 = ProxyConfig.from_string("192.168.1.1:8080") + +# IP:port:user:pass format +proxy5 = ProxyConfig.from_string("192.168.1.1:8080:user:pass") +``` + +## Authenticated Proxies + +For proxies requiring authentication: + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +# Using dictionary +run_config = CrawlerRunConfig(proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "your_username", + "password": "your_password" +}) + +# Using ProxyConfig object +from crawl4ai import ProxyConfig +proxy = ProxyConfig( + server="http://proxy.example.com:8080", + username="your_username", + password="your_password" +) 
+run_config = CrawlerRunConfig(proxy_config=proxy) + +browser_config = BrowserConfig() +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) +``` + +## Environment Variable Configuration + +Load proxies from environment variables for easy configuration: + +```python +import os +from crawl4ai import ProxyConfig, CrawlerRunConfig + +# Set environment variable +os.environ["PROXIES"] = "ip1:port1:user1:pass1,ip2:port2:user2:pass2,ip3:port3" + +# Load all proxies +proxies = ProxyConfig.from_env() +print(f"Loaded {len(proxies)} proxies") + +# Use first proxy +if proxies: + run_config = CrawlerRunConfig(proxy_config=proxies[0]) +``` + +## Rotating Proxies + +Crawl4AI supports automatic proxy rotation to distribute requests across multiple proxy servers. Rotation is applied per request using a rotation strategy on `CrawlerRunConfig`. + +### Proxy Rotation (recommended) +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, ProxyConfig +from crawl4ai.proxy_strategy import RoundRobinProxyStrategy +import re + async def main(): - # Load proxies and create rotation strategy + # Load proxies from environment proxies = ProxyConfig.from_env() - #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2" if not proxies: - print("No proxies found in environment. Set PROXIES env variable!") + print("No proxies found! 
Set PROXIES environment variable.") return + # Create rotation strategy proxy_strategy = RoundRobinProxyStrategy(proxies) - # Create configs + # Configure per-request with proxy rotation browser_config = BrowserConfig(headless=True, verbose=False) run_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, - proxy_rotation_strategy=proxy_strategy + proxy_rotation_strategy=proxy_strategy, ) async with AsyncWebCrawler(config=browser_config) as crawler: urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice - print("\nšŸ“ˆ Initializing crawler with proxy rotation...") - async with AsyncWebCrawler(config=browser_config) as crawler: - print("\nšŸš€ Starting batch crawl with proxy rotation...") - results = await crawler.arun_many( - urls=urls, - config=run_config - ) - for result in results: - if result.success: - ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) - current_proxy = run_config.proxy_config if run_config.proxy_config else None + print(f"šŸš€ Testing {len(proxies)} proxies with rotation...") + results = await crawler.arun_many(urls=urls, config=run_config) - if current_proxy and ip_match: - print(f"URL {result.url}") - print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}") - verified = ip_match.group(0) == current_proxy.ip - if verified: - print(f"āœ… Proxy working! 
IP matches: {current_proxy.ip}") - else: - print("āŒ Proxy failed or IP mismatch!") - print("---") + for i, result in enumerate(results): + if result.success: + # Extract IP from response + ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html) + if ip_match: + detected_ip = ip_match.group(0) + proxy_index = i % len(proxies) + expected_ip = proxies[proxy_index].ip + + print(f"āœ… Request {i+1}: Proxy {proxy_index+1} -> IP {detected_ip}") + if detected_ip == expected_ip: + print(" šŸŽÆ IP matches proxy configuration") + else: + print(f" āš ļø IP mismatch (expected {expected_ip})") + else: + print(f"āŒ Request {i+1}: Could not extract IP from response") + else: + print(f"āŒ Request {i+1}: Failed - {result.error_message}") asyncio.run(main()) - ``` +## SSL Certificate Analysis + +Combine proxy usage with SSL certificate inspection for enhanced security analysis. SSL certificate fetching is configured per request via `CrawlerRunConfig`. + +### Per-Request SSL Certificate Analysis +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +# Configure proxy with SSL certificate fetching per request +run_config = CrawlerRunConfig( + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "user", + "password": "pass" + }, + fetch_ssl_certificate=True # Enable SSL certificate analysis for this request +) + +browser_config = BrowserConfig() +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) + + if result.success: + print(f"āœ… Crawled via proxy: {result.url}") + + # Analyze SSL certificate + if result.ssl_certificate: + cert = result.ssl_certificate + print("šŸ”’ SSL Certificate Info:") + print(f" Issuer: {cert.issuer}") + print(f" Subject: {cert.subject}") + print(f" Valid until: {cert.valid_until}") + print(f" Fingerprint: {cert.fingerprint}") + + # Export certificate + cert.to_json("certificate.json") + print("šŸ’¾ 
Certificate exported to certificate.json") + else: + print("āš ļø No SSL certificate information available") +``` + +## Security Best Practices + +### 1. Proxy Rotation for Anonymity +```python +# Use multiple proxies to avoid IP blocking +proxies = ProxyConfig.from_env("PROXIES") +strategy = RoundRobinProxyStrategy(proxies) + +# Configure rotation per request (recommended) +run_config = CrawlerRunConfig(proxy_rotation_strategy=strategy) + +# If you want a single static proxy across all requests, set a fixed ProxyConfig at browser-level: +# browser_config = BrowserConfig(proxy_config=proxies[0]) +``` + +### 2. SSL Certificate Verification +```python +# Always verify SSL certificates when possible +# Per-request (affects specific requests) +run_config = CrawlerRunConfig(fetch_ssl_certificate=True) +``` + +### 3. Environment Variable Security +```bash +# Use environment variables for sensitive proxy credentials +# Avoid hardcoding usernames/passwords in code +export PROXIES="ip1:port1:user1:pass1,ip2:port2:user2:pass2" +``` + +### 4. SOCKS5 for Enhanced Security +```python +# Prefer SOCKS5 proxies for better protocol support +# Browser-level +browser_config = BrowserConfig(proxy_config="socks5://proxy.example.com:1080") + +# Or request-level +run_config = CrawlerRunConfig(proxy_config="socks5://proxy.example.com:1080") +``` + +## Migration from Deprecated `proxy` Parameter + +!!! warning "Deprecation Notice" + The `proxy` parameter in `BrowserConfig` is deprecated. Use `proxy_config` in either `BrowserConfig` or `CrawlerRunConfig` instead. + +```python +# Old (deprecated) +browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") + +# You will see a warning similar to: +# DeprecationWarning: BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead. 
+ +# New (recommended) - Browser-level default +browser_config = BrowserConfig(proxy_config="http://proxy.example.com:8080") + +# Or request-level override (takes precedence per request) +run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080") +``` + +### Safe Logging of Proxies +```python +from crawl4ai import ProxyConfig + +def safe_proxy_repr(proxy: ProxyConfig): + if getattr(proxy, "username", None): + return f"{proxy.server} (auth: ****)" + return proxy.server +``` + +## Troubleshooting + +### Common Issues + +1. **Proxy Connection Failed** + - Verify proxy server is accessible + - Check authentication credentials + - Ensure correct protocol (http/https/socks5) + +2. **SSL Certificate Errors** + - Some proxies may interfere with SSL inspection + - Try different proxy or disable SSL verification if necessary + +3. **Environment Variables Not Loading** + - Ensure PROXIES variable is set correctly + - Check comma separation and format: `ip:port:user:pass,ip:port:user:pass` + +4. 
**Proxy Rotation Not Working** + - Verify proxies are loaded: `len(proxies) > 0` + - Check proxy strategy is set on `CrawlerRunConfig` via `proxy_rotation_strategy` + - Ensure `proxy_config` is a valid `ProxyConfig` (when using a static proxy) + + + From fe353c4e27c1e339f8fb8f5f673ceafda704f680 Mon Sep 17 00:00:00 2001 From: AHMET YILMAZ Date: Thu, 13 Nov 2025 11:20:24 +0800 Subject: [PATCH 2/2] Refactor proxy configuration documentation for clarity and consistency --- docs/md_v2/advanced/proxy-security.md | 231 +++++++++++--------------- 1 file changed, 101 insertions(+), 130 deletions(-) diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index accfb25a..fc3e1904 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -4,73 +4,34 @@ This guide covers proxy configuration and security features in Crawl4AI, includi ## Understanding Proxy Configuration -Crawl4AI supports proxy configuration at two levels: - -### BrowserConfig.proxy_config -Sets proxy at the **browser level** - affects all pages/tabs in that browser instance. Use this when: -- You want all crawls from this browser to use the same proxy -- You're using a single proxy for the entire session -- You need persistent proxy settings across multiple crawls - -### CrawlerRunConfig.proxy_config -Sets proxy at the **request level** - can be different for each crawl operation. Use this when: -- You want per-request proxy control -- You're implementing proxy rotation -- Different URLs need different proxies +Crawl4AI recommends configuring proxies per request through `CrawlerRunConfig.proxy_config`. This gives you precise control, enables rotation strategies, and keeps examples simple enough to copy, paste, and run. 
## Basic Proxy Setup -### Browser-Level Proxy (BrowserConfig) - -Configure proxies that apply to the entire browser session: +Configure proxies that apply to each crawl operation: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, ProxyConfig -# Using dictionary configuration -browser_config = BrowserConfig(proxy_config={ - "server": "http://proxy.example.com:8080" -}) +run_config = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://proxy.example.com:8080")) +# run_config = CrawlerRunConfig(proxy_config={"server": "http://proxy.example.com:8080"}) +# run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080") -# Using ProxyConfig object -from crawl4ai import ProxyConfig -proxy = ProxyConfig(server="http://proxy.example.com:8080") -browser_config = BrowserConfig(proxy_config=proxy) -# Using string (auto-parsed) -browser_config = BrowserConfig(proxy_config="http://proxy.example.com:8080") +async def main(): + browser_config = BrowserConfig() + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) + print(f"Success: {result.success} -> {result.url}") -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") + +if __name__ == "__main__": + asyncio.run(main()) ``` -### Request-Level Proxy (CrawlerRunConfig) - -Configure proxies that can be customized per crawl operation: - -```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - -# Using dictionary configuration -run_config = CrawlerRunConfig(proxy_config={ - "server": "http://proxy.example.com:8080" -}) - -# Using ProxyConfig object -from crawl4ai import ProxyConfig -proxy = ProxyConfig(server="http://proxy.example.com:8080") -run_config = CrawlerRunConfig(proxy_config=proxy) - -# Using string (auto-parsed) -run_config = 
CrawlerRunConfig(proxy_config="http://proxy.example.com:8080") - -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com", config=run_config) -``` - -!!! note "Priority Order" - When both `BrowserConfig.proxy_config` and `CrawlerRunConfig.proxy_config` are set, `CrawlerRunConfig.proxy_config` takes precedence for that specific crawl operation. +!!! note "Why request-level?" + `CrawlerRunConfig.proxy_config` keeps each request self-contained, so swapping proxies or rotation strategies is just a matter of building a new run configuration. ## Supported Proxy Formats @@ -100,27 +61,33 @@ proxy5 = ProxyConfig.from_string("192.168.1.1:8080:user:pass") For proxies requiring authentication: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, ProxyConfig -# Using dictionary -run_config = CrawlerRunConfig(proxy_config={ - "server": "http://proxy.example.com:8080", - "username": "your_username", - "password": "your_password" -}) - -# Using ProxyConfig object -from crawl4ai import ProxyConfig -proxy = ProxyConfig( - server="http://proxy.example.com:8080", - username="your_username", - password="your_password" +run_config = CrawlerRunConfig( + proxy_config=ProxyConfig( + server="http://proxy.example.com:8080", + username="your_username", + password="your_password", + ) ) -run_config = CrawlerRunConfig(proxy_config=proxy) +# Or dictionary style: +# run_config = CrawlerRunConfig(proxy_config={ +# "server": "http://proxy.example.com:8080", +# "username": "your_username", +# "password": "your_password", +# }) -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com", config=run_config) + +async def main(): + browser_config = BrowserConfig() + async with 
AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) + print(f"Success: {result.success} -> {result.url}") + + +if __name__ == "__main__": + asyncio.run(main()) ``` ## Environment Variable Configuration @@ -149,9 +116,10 @@ Crawl4AI supports automatic proxy rotation to distribute requests across multipl ### Proxy Rotation (recommended) ```python +import asyncio +import re from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, ProxyConfig from crawl4ai.proxy_strategy import RoundRobinProxyStrategy -import re async def main(): # Load proxies from environment @@ -195,7 +163,8 @@ async def main(): else: print(f"āŒ Request {i+1}: Failed - {result.error_message}") -asyncio.run(main()) +if __name__ == "__main__": + asyncio.run(main()) ``` ## SSL Certificate Analysis @@ -204,45 +173,54 @@ Combine proxy usage with SSL certificate inspection for enhanced security analys ### Per-Request SSL Certificate Analysis ```python +import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -# Configure proxy with SSL certificate fetching per request run_config = CrawlerRunConfig( proxy_config={ "server": "http://proxy.example.com:8080", "username": "user", - "password": "pass" + "password": "pass", }, - fetch_ssl_certificate=True # Enable SSL certificate analysis for this request + fetch_ssl_certificate=True, # Enable SSL certificate analysis for this request ) -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com", config=run_config) - if result.success: - print(f"āœ… Crawled via proxy: {result.url}") +async def main(): + browser_config = BrowserConfig() + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=run_config) - # Analyze SSL certificate - if result.ssl_certificate: - cert = 
result.ssl_certificate - print("šŸ”’ SSL Certificate Info:") - print(f" Issuer: {cert.issuer}") - print(f" Subject: {cert.subject}") - print(f" Valid until: {cert.valid_until}") - print(f" Fingerprint: {cert.fingerprint}") + if result.success: + print(f"āœ… Crawled via proxy: {result.url}") - # Export certificate - cert.to_json("certificate.json") - print("šŸ’¾ Certificate exported to certificate.json") - else: - print("āš ļø No SSL certificate information available") + # Analyze SSL certificate + if result.ssl_certificate: + cert = result.ssl_certificate + print("šŸ”’ SSL Certificate Info:") + print(f" Issuer: {cert.issuer}") + print(f" Subject: {cert.subject}") + print(f" Valid until: {cert.valid_until}") + print(f" Fingerprint: {cert.fingerprint}") + + # Export certificate + cert.to_json("certificate.json") + print("šŸ’¾ Certificate exported to certificate.json") + else: + print("āš ļø No SSL certificate information available") + + +if __name__ == "__main__": + asyncio.run(main()) ``` ## Security Best Practices ### 1. Proxy Rotation for Anonymity ```python +from crawl4ai import CrawlerRunConfig, ProxyConfig +from crawl4ai.proxy_strategy import RoundRobinProxyStrategy + # Use multiple proxies to avoid IP blocking proxies = ProxyConfig.from_env("PROXIES") strategy = RoundRobinProxyStrategy(proxies) @@ -250,12 +228,14 @@ strategy = RoundRobinProxyStrategy(proxies) # Configure rotation per request (recommended) run_config = CrawlerRunConfig(proxy_rotation_strategy=strategy) -# If you want a single static proxy across all requests, set a fixed ProxyConfig at browser-level: -# browser_config = BrowserConfig(proxy_config=proxies[0]) +# For a fixed proxy across all requests, pass a single ProxyConfig via proxy_config instead of a rotation strategy +static_run_config = CrawlerRunConfig(proxy_config=proxies[0]) ``` ### 2. 
SSL Certificate Verification ```python +from crawl4ai import CrawlerRunConfig + # Always verify SSL certificates when possible # Per-request (affects specific requests) run_config = CrawlerRunConfig(fetch_ssl_certificate=True) @@ -270,30 +250,24 @@ export PROXIES="ip1:port1:user1:pass1,ip2:port2:user2:pass2" ### 4. SOCKS5 for Enhanced Security ```python -# Prefer SOCKS5 proxies for better protocol support -# Browser-level -browser_config = BrowserConfig(proxy_config="socks5://proxy.example.com:1080") +from crawl4ai import CrawlerRunConfig -# Or request-level +# Prefer SOCKS5 proxies for better protocol support run_config = CrawlerRunConfig(proxy_config="socks5://proxy.example.com:1080") ``` ## Migration from Deprecated `proxy` Parameter !!! warning "Deprecation Notice" - The `proxy` parameter in `BrowserConfig` is deprecated. Use `proxy_config` in either `BrowserConfig` or `CrawlerRunConfig` instead. + The legacy `proxy` argument on `BrowserConfig` is deprecated. Configure proxies through `CrawlerRunConfig.proxy_config` so each request fully describes its network settings. ```python -# Old (deprecated) -browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") +# Old (deprecated) approach +# from crawl4ai import BrowserConfig +# browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") -# You will see a warning similar to: -# DeprecationWarning: BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead. - -# New (recommended) - Browser-level default -browser_config = BrowserConfig(proxy_config="http://proxy.example.com:8080") - -# Or request-level override (takes precedence per request) +# New (preferred) approach +from crawl4ai import CrawlerRunConfig run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080") ``` @@ -311,23 +285,20 @@ def safe_proxy_repr(proxy: ProxyConfig): ### Common Issues -1. 
**Proxy Connection Failed** - - Verify proxy server is accessible - - Check authentication credentials - - Ensure correct protocol (http/https/socks5) +???+ question "Proxy connection failed" + - Verify the proxy server is reachable from your network. + - Double-check authentication credentials. + - Ensure the protocol matches (`http`, `https`, or `socks5`). -2. **SSL Certificate Errors** - - Some proxies may interfere with SSL inspection - - Try different proxy or disable SSL verification if necessary +???+ question "SSL certificate errors" + - Some proxies break SSL inspection; switch proxies if you see repeated failures. + - Consider temporarily disabling certificate fetching to isolate the issue. -3. **Environment Variables Not Loading** - - Ensure PROXIES variable is set correctly - - Check comma separation and format: `ip:port:user:pass,ip:port:user:pass` - -4. **Proxy Rotation Not Working** - - Verify proxies are loaded: `len(proxies) > 0` - - Check proxy strategy is set on `CrawlerRunConfig` via `proxy_rotation_strategy` - - Ensure `proxy_config` is a valid `ProxyConfig` (when using a static proxy) - - +???+ question "Environment variables not loading" + - Confirm `PROXIES` (or your custom env var) is set before running the script. + - Check formatting: `ip:port:user:pass,ip:port:user:pass`. +???+ question "Proxy rotation not working" + - Ensure `ProxyConfig.from_env()` actually loaded entries (`len(proxies) > 0`). + - Attach `proxy_rotation_strategy` to `CrawlerRunConfig`. + - Validate the proxy definitions you pass into the strategy.