diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index ff02eda6..e559fc5b 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -347,6 +347,8 @@ class GeolocationConfig:
         return GeolocationConfig.from_dict(config_dict)
 
 class ProxyConfig:
+    DIRECT = "direct"  # Sentinel: use in proxy_config list to mean "no proxy"
+
     def __init__(
         self,
         server: str,
@@ -1498,7 +1500,9 @@ class CrawlerRunConfig():
         if isinstance(proxy_config, list):
             normalized = []
             for p in proxy_config:
-                if isinstance(p, dict):
+                if p is None or p == "direct":
+                    normalized.append(None)
+                elif isinstance(p, dict):
                     normalized.append(ProxyConfig.from_dict(p))
                 elif isinstance(p, str):
                     normalized.append(ProxyConfig.from_string(p))
diff --git a/docs/md_v2/advanced/anti-bot-and-fallback.md b/docs/md_v2/advanced/anti-bot-and-fallback.md
index 3fa7988b..68ed03de 100644
--- a/docs/md_v2/advanced/anti-bot-and-fallback.md
+++ b/docs/md_v2/advanced/anti-bot-and-fallback.md
@@ -21,7 +21,7 @@ All anti-bot retry options live on `CrawlerRunConfig`:
 
 | Parameter | Type | Default | Description |
 |---|---|---|---|
-| `proxy_config` | `ProxyConfig`, `list[ProxyConfig]`, or `None` | `None` | Single proxy or ordered list of proxies to try. Each retry round iterates through the full list. |
+| `proxy_config` | `ProxyConfig`, `list[ProxyConfig]`, or `None` | `None` | Single proxy or ordered list of proxies to try. Each retry round iterates through the full list. Use `"direct"` or `ProxyConfig.DIRECT` in a list to explicitly try without a proxy. |
 | `max_retries` | `int` | `0` | Number of retry rounds when blocking is detected. `0` = no retries. |
 | `fallback_fetch_function` | `async (str) -> str` | `None` | Async function called as last resort. Takes URL, returns raw HTML. |
 
@@ -95,6 +95,31 @@ config = CrawlerRunConfig(
 )
 ```
 
+### Direct-First, Then Proxies
+
+Try without a proxy first, then escalate to proxies if blocked. Use `ProxyConfig.DIRECT` (the string `"direct"` or `None` also work) in the list to represent a no-proxy attempt.
+
+```python
+config = CrawlerRunConfig(
+    max_retries=1,
+    proxy_config=[
+        ProxyConfig.DIRECT,  # Try without proxy first
+        ProxyConfig(
+            server="http://datacenter-proxy.example.com:8080",
+            username="user",
+            password="pass",
+        ),
+        ProxyConfig(
+            server="http://residential-proxy.example.com:9090",
+            username="user",
+            password="pass",
+        ),
+    ],
+)
+```
+
+With this setup, each round tries direct first, then datacenter, then residential. With `max_retries=1`, the worst case is 2 rounds (the initial round plus one retry round) × 3 steps = 6 attempts.
+
 ### Proxy List (Escalation)
 
 Pass a list of proxies. They're tried in order — first one that works wins. Within each retry round, the entire list is tried again.