Unify proxy_config to accept list, add crawl_stats tracking

- proxy_config on CrawlerRunConfig now accepts a single ProxyConfig or
  a list of ProxyConfig tried in order (first-come-first-served)
- Remove is_fallback from ProxyConfig and fallback_proxy_configs from
  CrawlerRunConfig — proxy escalation handled entirely by list order
- Add _get_proxy_list() normalizer for the retry loop
- Add CrawlResult.crawl_stats with attempts, retries, proxies_used,
  fallback_fetch_used, and resolved_by for billing and observability
- Set success=False with error_message when all attempts are blocked
- Simplify retry loop — no more is_fallback stashing logic
- Update docs and tests to reflect new API
This commit is contained in:
unclecode
2026-02-14 07:53:46 +00:00
parent 72b546c48d
commit 875207287e
6 changed files with 141 additions and 115 deletions

View File

@@ -353,7 +353,6 @@ class ProxyConfig:
username: Optional[str] = None, username: Optional[str] = None,
password: Optional[str] = None, password: Optional[str] = None,
ip: Optional[str] = None, ip: Optional[str] = None,
is_fallback: bool = False,
): ):
"""Configuration class for a single proxy. """Configuration class for a single proxy.
@@ -362,13 +361,10 @@ class ProxyConfig:
username: Optional username for proxy authentication username: Optional username for proxy authentication
password: Optional password for proxy authentication password: Optional password for proxy authentication
ip: Optional IP address for verification purposes ip: Optional IP address for verification purposes
is_fallback: If True, proxy is only used when anti-bot blocking is
detected. If False (default), proxy is used on every request.
""" """
self.server = server self.server = server
self.username = username self.username = username
self.password = password self.password = password
self.is_fallback = is_fallback
# Extract IP from server if not explicitly provided # Extract IP from server if not explicitly provided
self.ip = ip or self._extract_ip_from_server() self.ip = ip or self._extract_ip_from_server()
@@ -430,7 +426,6 @@ class ProxyConfig:
username=proxy_dict.get("username"), username=proxy_dict.get("username"),
password=proxy_dict.get("password"), password=proxy_dict.get("password"),
ip=proxy_dict.get("ip"), ip=proxy_dict.get("ip"),
is_fallback=proxy_dict.get("is_fallback", False),
) )
@staticmethod @staticmethod
@@ -461,7 +456,6 @@ class ProxyConfig:
"username": self.username, "username": self.username,
"password": self.password, "password": self.password,
"ip": self.ip, "ip": self.ip,
"is_fallback": self.is_fallback,
} }
def clone(self, **kwargs) -> "ProxyConfig": def clone(self, **kwargs) -> "ProxyConfig":
@@ -1379,7 +1373,7 @@ class CrawlerRunConfig():
prettiify: bool = False, prettiify: bool = False,
parser_type: str = "lxml", parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None, scraping_strategy: ContentScrapingStrategy = None,
proxy_config: Union[ProxyConfig, dict, None] = None, proxy_config: Union["ProxyConfig", List["ProxyConfig"], dict, str, None] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# Sticky Proxy Session Parameters # Sticky Proxy Session Parameters
proxy_session_id: Optional[str] = None, proxy_session_id: Optional[str] = None,
@@ -1478,7 +1472,6 @@ class CrawlerRunConfig():
experimental: Dict[str, Any] = None, experimental: Dict[str, Any] = None,
# Anti-Bot Retry Parameters # Anti-Bot Retry Parameters
max_retries: int = 0, max_retries: int = 0,
fallback_proxy_configs: Optional[List["ProxyConfig"]] = None,
fallback_fetch_function: Optional[Callable[[str], Awaitable[str]]] = None, fallback_fetch_function: Optional[Callable[[str], Awaitable[str]]] = None,
): ):
# TODO: Planning to set properties dynamically based on the __init__ signature # TODO: Planning to set properties dynamically based on the __init__ signature
@@ -1501,11 +1494,23 @@ class CrawlerRunConfig():
self.prettiify = prettiify self.prettiify = prettiify
self.parser_type = parser_type self.parser_type = parser_type
self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy() self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
self.proxy_config = proxy_config # Normalize proxy_config: single ProxyConfig stored as-is, list stored as list
if isinstance(proxy_config, dict): if isinstance(proxy_config, list):
normalized = []
for p in proxy_config:
if isinstance(p, dict):
normalized.append(ProxyConfig.from_dict(p))
elif isinstance(p, str):
normalized.append(ProxyConfig.from_string(p))
else:
normalized.append(p)
self.proxy_config = normalized
elif isinstance(proxy_config, dict):
self.proxy_config = ProxyConfig.from_dict(proxy_config) self.proxy_config = ProxyConfig.from_dict(proxy_config)
if isinstance(proxy_config, str): elif isinstance(proxy_config, str):
self.proxy_config = ProxyConfig.from_string(proxy_config) self.proxy_config = ProxyConfig.from_string(proxy_config)
else:
self.proxy_config = proxy_config # ProxyConfig or None
self.proxy_rotation_strategy = proxy_rotation_strategy self.proxy_rotation_strategy = proxy_rotation_strategy
@@ -1665,7 +1670,6 @@ class CrawlerRunConfig():
# Anti-Bot Retry Parameters # Anti-Bot Retry Parameters
self.max_retries = max_retries self.max_retries = max_retries
self.fallback_proxy_configs = fallback_proxy_configs or []
self.fallback_fetch_function = fallback_fetch_function self.fallback_fetch_function = fallback_fetch_function
# Compile C4A scripts if provided # Compile C4A scripts if provided
@@ -1673,6 +1677,14 @@ class CrawlerRunConfig():
self._compile_c4a_script() self._compile_c4a_script()
def _get_proxy_list(self) -> list:
    """Return proxy_config as a list suitable for the retry loop.

    A missing or empty configuration yields ``[None]`` so the retry
    loop always makes at least one (direct, proxy-less) attempt.
    """
    cfg = self.proxy_config
    if cfg is None:
        return [None]
    if isinstance(cfg, list):
        return cfg or [None]
    return [cfg]
def _compile_c4a_script(self): def _compile_c4a_script(self):
"""Compile C4A script to JavaScript""" """Compile C4A script to JavaScript"""
try: try:
@@ -1828,7 +1840,11 @@ class CrawlerRunConfig():
"prettiify": self.prettiify, "prettiify": self.prettiify,
"parser_type": self.parser_type, "parser_type": self.parser_type,
"scraping_strategy": self.scraping_strategy, "scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config.to_dict() if hasattr(self.proxy_config, 'to_dict') else self.proxy_config, "proxy_config": (
[p.to_dict() if hasattr(p, 'to_dict') else p for p in self.proxy_config]
if isinstance(self.proxy_config, list)
else (self.proxy_config.to_dict() if hasattr(self.proxy_config, 'to_dict') else self.proxy_config)
),
"proxy_rotation_strategy": self.proxy_rotation_strategy, "proxy_rotation_strategy": self.proxy_rotation_strategy,
"proxy_session_id": self.proxy_session_id, "proxy_session_id": self.proxy_session_id,
"proxy_session_ttl": self.proxy_session_ttl, "proxy_session_ttl": self.proxy_session_ttl,
@@ -1903,7 +1919,6 @@ class CrawlerRunConfig():
"match_mode": self.match_mode, "match_mode": self.match_mode,
"experimental": self.experimental, "experimental": self.experimental,
"max_retries": self.max_retries, "max_retries": self.max_retries,
"fallback_proxy_configs": [p.to_dict() for p in self.fallback_proxy_configs] if self.fallback_proxy_configs else [],
} }
def clone(self, **kwargs): def clone(self, **kwargs):

View File

@@ -393,24 +393,26 @@ class AsyncWebCrawler:
) )
# --- Anti-bot retry setup --- # --- Anti-bot retry setup ---
_fallback_proxy = None
if (config.proxy_config
and getattr(config.proxy_config, "is_fallback", False)):
_fallback_proxy = config.proxy_config
config.proxy_config = None
_max_attempts = 1 + getattr(config, "max_retries", 0) _max_attempts = 1 + getattr(config, "max_retries", 0)
_fallback_proxies = getattr(config, "fallback_proxy_configs", None) or [] _proxy_list = config._get_proxy_list()
_proxy_activated = False _original_proxy_config = config.proxy_config
_block_reason = "" _block_reason = ""
_done = False _done = False
crawl_result = None crawl_result = None
_crawl_stats = {
"attempts": 0,
"retries": 0,
"proxies_used": [],
"fallback_fetch_used": False,
"resolved_by": None,
}
for _attempt in range(_max_attempts): for _attempt in range(_max_attempts):
if _done: if _done:
break break
if _attempt > 0: if _attempt > 0:
_crawl_stats["retries"] = _attempt
self.logger.warning( self.logger.warning(
message="Anti-bot retry {attempt}/{max_retries} for {url}{reason}", message="Anti-bot retry {attempt}/{max_retries} for {url}{reason}",
tag="ANTIBOT", tag="ANTIBOT",
@@ -421,38 +423,22 @@ class AsyncWebCrawler:
"reason": _block_reason, "reason": _block_reason,
}, },
) )
# Activate is_fallback proxy on first retry
if _fallback_proxy and not _proxy_activated:
config.proxy_config = _fallback_proxy
_proxy_activated = True
self.logger.info(
message="Activating fallback proxy: {proxy}",
tag="ANTIBOT",
params={"proxy": _fallback_proxy.server},
)
# Build list of proxies to try this round: for _p_idx, _proxy in enumerate(_proxy_list):
# current config.proxy_config first, then each fallback proxy if _p_idx > 0 or _attempt > 0:
_proxies_this_round = [config.proxy_config] # main (may be None)
_proxies_this_round.extend(_fallback_proxies)
for _p_idx, _proxy in enumerate(_proxies_this_round):
_is_fallback_proxy = _p_idx > 0
if _is_fallback_proxy:
self.logger.info( self.logger.info(
message="Trying fallback proxy {idx}/{total}: {proxy}", message="Trying proxy {idx}/{total}: {proxy}",
tag="ANTIBOT", tag="ANTIBOT",
params={ params={
"idx": _p_idx, "idx": _p_idx + 1,
"total": len(_fallback_proxies), "total": len(_proxy_list),
"proxy": _proxy.server, "proxy": _proxy.server if _proxy else "direct",
}, },
) )
# Temporarily set the proxy for this attempt # Set the active proxy for this attempt
_saved_proxy = config.proxy_config config.proxy_config = _proxy
if _is_fallback_proxy: _crawl_stats["attempts"] += 1
config.proxy_config = _proxy
try: try:
t1 = time.perf_counter() t1 = time.perf_counter()
@@ -507,27 +493,38 @@ class AsyncWebCrawler:
# Check if blocked # Check if blocked
_blocked, _block_reason = is_blocked( _blocked, _block_reason = is_blocked(
async_response.status_code, html) async_response.status_code, html)
_crawl_stats["proxies_used"].append({
"proxy": _proxy.server if _proxy else None,
"status_code": async_response.status_code,
"blocked": _blocked,
"reason": _block_reason if _blocked else "",
})
if not _blocked: if not _blocked:
_crawl_stats["resolved_by"] = "proxy" if _proxy else "direct"
_done = True _done = True
break # Success — exit proxy loop break # Success — exit proxy loop
except Exception as _crawl_err: except Exception as _crawl_err:
if _is_fallback_proxy: _crawl_stats["proxies_used"].append({
"proxy": _proxy.server if _proxy else None,
"status_code": None,
"blocked": True,
"reason": str(_crawl_err),
})
if _p_idx > 0 or _attempt > 0:
self.logger.error_status( self.logger.error_status(
url=url, url=url,
error=f"Fallback proxy {_proxy.server} failed: {_crawl_err}", error=f"Proxy {_proxy.server if _proxy else 'direct'} failed: {_crawl_err}",
tag="ANTIBOT", tag="ANTIBOT",
) )
_block_reason = str(_crawl_err) _block_reason = str(_crawl_err)
else: else:
raise # Let main proxy errors propagate normally raise # First attempt on first proxy propagates normally
finally:
if _is_fallback_proxy:
config.proxy_config = _saved_proxy
# --- Restore stashed is_fallback proxy for config integrity --- # Restore original proxy_config
if _fallback_proxy and not _proxy_activated: config.proxy_config = _original_proxy_config
config.proxy_config = _fallback_proxy
# --- Fallback fetch function (last resort after all retries+proxies exhausted) --- # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
if (crawl_result if (crawl_result
@@ -540,6 +537,7 @@ class AsyncWebCrawler:
tag="ANTIBOT", tag="ANTIBOT",
params={"url": url[:80]}, params={"url": url[:80]},
) )
_crawl_stats["fallback_fetch_used"] = True
try: try:
_fallback_html = await config.fallback_fetch_function(url) _fallback_html = await config.fallback_fetch_function(url)
if _fallback_html: if _fallback_html:
@@ -560,6 +558,7 @@ class AsyncWebCrawler:
crawl_result.status_code = 200 crawl_result.status_code = 200
crawl_result.session_id = getattr(config, "session_id", None) crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss" crawl_result.cache_status = "miss"
_crawl_stats["resolved_by"] = "fallback_fetch"
except Exception as _fallback_err: except Exception as _fallback_err:
self.logger.error_status( self.logger.error_status(
url=url, url=url,
@@ -567,6 +566,15 @@ class AsyncWebCrawler:
tag="ANTIBOT", tag="ANTIBOT",
) )
# --- Mark blocked results as failed ---
if crawl_result:
_blocked, _block_reason = is_blocked(
crawl_result.status_code, crawl_result.html or "")
if _blocked:
crawl_result.success = False
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
crawl_result.crawl_stats = _crawl_stats
# Compute head fingerprint for cache validation # Compute head fingerprint for cache validation
if crawl_result and crawl_result.html: if crawl_result and crawl_result.html:
head_end = crawl_result.html.lower().find('</head>') head_end = crawl_result.html.lower().find('</head>')

View File

@@ -157,6 +157,8 @@ class CrawlResult(BaseModel):
head_fingerprint: Optional[str] = None head_fingerprint: Optional[str] = None
cached_at: Optional[float] = None cached_at: Optional[float] = None
cache_status: Optional[str] = None # "hit", "hit_validated", "hit_fallback", "miss" cache_status: Optional[str] = None # "hit", "hit_validated", "hit_fallback", "miss"
# Anti-bot retry/proxy usage stats
crawl_stats: Optional[Dict[str, Any]] = None
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)

View File

@@ -13,39 +13,53 @@ After each crawl attempt, Crawl4AI inspects the HTTP status code and HTML conten
Detection uses structural HTML markers (specific element IDs, script sources, form actions) rather than generic keywords to minimize false positives. A normal page that happens to mention "CAPTCHA" or "Cloudflare" in its content will not be flagged. Detection uses structural HTML markers (specific element IDs, script sources, form actions) rather than generic keywords to minimize false positives. A normal page that happens to mention "CAPTCHA" or "Cloudflare" in its content will not be flagged.
When all attempts fail and blocking is still detected, the result is returned with `success=False` and `error_message` describing the block reason.
## Configuration Options ## Configuration Options
All anti-bot retry options live on `CrawlerRunConfig`: All anti-bot retry options live on `CrawlerRunConfig`:
| Parameter | Type | Default | Description | | Parameter | Type | Default | Description |
|---|---|---|---| |---|---|---|---|
| `proxy_config` | `ProxyConfig`, `list[ProxyConfig]`, or `None` | `None` | Single proxy or ordered list of proxies to try. Each retry round iterates through the full list. |
| `max_retries` | `int` | `0` | Number of retry rounds when blocking is detected. `0` = no retries. | | `max_retries` | `int` | `0` | Number of retry rounds when blocking is detected. `0` = no retries. |
| `fallback_proxy_configs` | `list[ProxyConfig]` | `[]` | List of fallback proxies tried in order within each retry round. |
| `fallback_fetch_function` | `async (str) -> str` | `None` | Async function called as last resort. Takes URL, returns raw HTML. | | `fallback_fetch_function` | `async (str) -> str` | `None` | Async function called as last resort. Takes URL, returns raw HTML. |
And on `ProxyConfig`:
| Parameter | Type | Default | Description |
|---|---|---|---|
| `is_fallback` | `bool` | `False` | When `True`, this proxy is skipped on the first attempt and only activated after blocking is detected. |
## Escalation Chain ## Escalation Chain
Each retry round tries the main proxy first, then each fallback proxy in order. If all rounds are exhausted and the page is still blocked, the fallback fetch function is called as a last resort. Each retry round tries every proxy in `proxy_config` in order. If all rounds are exhausted and the page is still blocked, the fallback fetch function is called as a last resort.
``` ```
For each round (1 + max_retries rounds): For each round (1 + max_retries rounds):
1. Try with main proxy_config (or no proxy if is_fallback=True on first round) 1. Try proxy_config[0] (or direct if proxy_config is None)
2. If blocked → try fallback_proxy_configs[0] 2. If blocked → try proxy_config[1]
3. If blocked → try fallback_proxy_configs[1] 3. If blocked → try proxy_config[2]
4. ... continue through all fallback proxies 4. ... continue through all proxies
5. If any attempt succeeds → done 5. If any attempt succeeds → done
If all rounds exhausted and still blocked: If all rounds exhausted and still blocked:
6. Call fallback_fetch_function(url) → process returned HTML 6. Call fallback_fetch_function(url) → process returned HTML
``` ```
Worst-case attempts before the fetch function: `(1 + max_retries) x (1 + len(fallback_proxy_configs))` Worst-case attempts before the fetch function: `(1 + max_retries) x max(1, len(proxy_config))` — with `proxy_config=None` each round still makes one direct attempt
## Crawl Stats
Every crawl result includes a `crawl_stats` dict with detailed attempt tracking:
```python
result.crawl_stats = {
"attempts": 3, # total browser attempts made
"retries": 1, # retry rounds used (0 = succeeded first round)
"proxies_used": [ # ordered list of every attempt
{"proxy": None, "status_code": 403, "blocked": True, "reason": "Akamai block (Reference #)"},
{"proxy": "proxy.io:8080", "status_code": 403, "blocked": True, "reason": "Akamai block (Reference #)"},
{"proxy": "premium.io:9090", "status_code": 200, "blocked": False, "reason": ""},
],
"fallback_fetch_used": False, # whether fallback_fetch_function was called
    "resolved_by": "proxy", # "direct" | "proxy" | "fallback_fetch" | None (all failed)
}
```
## Usage Examples ## Usage Examples
@@ -64,9 +78,9 @@ async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
) )
``` ```
### Proxy as Fallback Only ### Single Proxy
Use `is_fallback=True` to skip the proxy on the first attempt. If the site doesn't block you, no proxy credits are consumed. If it does, the proxy activates on retry. Pass a single `ProxyConfig` — it's used on every attempt. Same behavior as always.
```python ```python
from crawl4ai.async_configs import ProxyConfig from crawl4ai.async_configs import ProxyConfig
@@ -77,24 +91,23 @@ config = CrawlerRunConfig(
server="http://proxy.example.com:8080", server="http://proxy.example.com:8080",
username="user", username="user",
password="pass", password="pass",
is_fallback=True, # Only used when blocking is detected
), ),
) )
``` ```
### Fallback Proxy List ### Proxy List (Escalation)
Try a cheaper proxy first, escalate to a premium proxy if it fails. Both are tried within each retry round. Pass a list of proxies. They're tried in order — first one that works wins. Within each retry round, the entire list is tried again.
```python ```python
config = CrawlerRunConfig( config = CrawlerRunConfig(
max_retries=2, max_retries=1,
proxy_config=ProxyConfig( proxy_config=[
server="http://datacenter-proxy.example.com:8080", ProxyConfig(
username="user", server="http://datacenter-proxy.example.com:8080",
password="pass", username="user",
), password="pass",
fallback_proxy_configs=[ ),
ProxyConfig( ProxyConfig(
server="http://residential-proxy.example.com:9090", server="http://residential-proxy.example.com:9090",
username="user", username="user",
@@ -104,7 +117,7 @@ config = CrawlerRunConfig(
) )
``` ```
With this setup, each round tries the datacenter proxy first, then the residential proxy. With `max_retries=2`, worst case is 3 rounds x 2 proxies = 6 attempts. With this setup, each round tries the datacenter proxy first, then the residential proxy. With `max_retries=1`, worst case is 2 rounds x 2 proxies = 4 attempts.
### Fallback Fetch Function ### Fallback Fetch Function
@@ -137,7 +150,7 @@ The function can do anything — call an API, read from a database, return cache
### Full Escalation (All Features Combined) ### Full Escalation (All Features Combined)
This example combines every layer: stealth mode, a fallback proxy that only activates when blocked, a list of escalation proxies tried each round, retries, and a final fetch function. This example combines every layer: stealth mode, a list of proxies tried in order, retries, and a final fetch function.
```python ```python
import aiohttp import aiohttp
@@ -164,16 +177,13 @@ crawl_config = CrawlerRunConfig(
wait_until="load", wait_until="load",
max_retries=2, max_retries=2,
# Primary proxy — is_fallback=True means first attempt runs without it # Proxies tried in order — cheapest first
proxy_config=ProxyConfig( proxy_config=[
server="http://datacenter-proxy.example.com:8080", ProxyConfig(
username="user", server="http://datacenter-proxy.example.com:8080",
password="pass", username="user",
is_fallback=True, password="pass",
), ),
# Fallback proxies — tried in order after main proxy fails each round
fallback_proxy_configs=[
ProxyConfig( ProxyConfig(
server="http://residential-proxy.example.com:9090", server="http://residential-proxy.example.com:9090",
username="user", username="user",
@@ -193,6 +203,8 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
if result.success: if result.success:
print(f"Got {len(result.markdown.raw_markdown)} chars of markdown") print(f"Got {len(result.markdown.raw_markdown)} chars of markdown")
print(f"Resolved by: {result.crawl_stats['resolved_by']}")
print(f"Attempts: {result.crawl_stats['attempts']}")
else: else:
print(f"All attempts failed: {result.error_message}") print(f"All attempts failed: {result.error_message}")
``` ```
@@ -201,12 +213,12 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
| Round | Attempt | What runs | | Round | Attempt | What runs |
|---|---|---| |---|---|---|
| 1 | 1 | No proxy (is_fallback skips it) — blocked | | 1 | 1 | Datacenter proxy — blocked |
| 1 | 2 | Residential fallback proxy — blocked (bad IP) | | 1 | 2 | Residential proxy — blocked |
| 2 | 1 | Datacenter proxy activated — blocked | | 2 | 1 | Datacenter proxy — blocked |
| 2 | 2 | Residential fallback proxy — blocked | | 2 | 2 | Residential proxy — blocked |
| 3 | 1 | Datacenter proxy — blocked | | 3 | 1 | Datacenter proxy — blocked |
| 3 | 2 | Residential fallback proxy — blocked | | 3 | 2 | Residential proxy — blocked |
| - | - | `external_fetch(url)` called — returns HTML | | - | - | `external_fetch(url)` called — returns HTML |
That's up to 6 browser attempts + 1 function call before giving up. That's up to 6 browser attempts + 1 function call before giving up.
@@ -214,19 +226,10 @@ That's up to 6 browser attempts + 1 function call before giving up.
## Tips ## Tips
- **Start with `max_retries=0`** and a `fallback_fetch_function` if you just want a safety net without burning time on retries. - **Start with `max_retries=0`** and a `fallback_fetch_function` if you just want a safety net without burning time on retries.
- **Use `is_fallback=True`** on your proxy to avoid consuming proxy credits on sites that don't need them. - **Order proxies cheapest-first** — datacenter proxies before residential, residential before premium.
- **Order fallback proxies cheapest-first** — datacenter proxies before residential, residential before premium.
- **Combine with stealth mode** — `BrowserConfig(enable_stealth=True)` and `CrawlerRunConfig(magic=True)` reduce the chance of being blocked in the first place. - **Combine with stealth mode** — `BrowserConfig(enable_stealth=True)` and `CrawlerRunConfig(magic=True)` reduce the chance of being blocked in the first place.
- **`wait_until="load"`** is important for anti-bot sites — the default `domcontentloaded` can return before the anti-bot sensor finishes. - **`wait_until="load"`** is important for anti-bot sites — the default `domcontentloaded` can return before the anti-bot sensor finishes.
- **You don't need a primary proxy to use fallback proxies.** If you skip `proxy_config` and only pass `fallback_proxy_configs`, the first attempt each round runs with no proxy. This is useful when you want to try direct access first and only escalate to proxies if blocked: - **Check `crawl_stats`** to understand what happened — how many attempts, which proxy worked, whether the fallback function was needed.
```python
config = CrawlerRunConfig(
max_retries=1,
fallback_proxy_configs=[proxy_A, proxy_B],
)
# Round 1: no proxy → proxy_A → proxy_B
# Round 2: no proxy → proxy_A → proxy_B
```
## See Also ## See Also

View File

@@ -109,10 +109,9 @@ We group them by category.
| **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). | | **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). |
| **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. | | **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. |
| **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. | | **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. |
| **`proxy_config`** | `ProxyConfig or dict or None` (None) | Proxy configuration for this specific crawl. Can override browser-level proxy settings. Set `is_fallback=True` on the ProxyConfig to only use the proxy when anti-bot blocking is detected. | | **`proxy_config`** | `ProxyConfig`, `list[ProxyConfig]`, or `None` (None) | Proxy configuration for this specific crawl. Pass a single proxy or an ordered list of proxies to try. See [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md). |
| **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. | | **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. |
| **`max_retries`** | `int` (0) | Number of retry rounds when anti-bot blocking is detected. Each round tries the main proxy and all fallback proxies. | | **`max_retries`** | `int` (0) | Number of retry rounds when anti-bot blocking is detected. Each round tries all proxies in `proxy_config`. |
| **`fallback_proxy_configs`** | `list[ProxyConfig]` ([]) | List of fallback proxies tried in order within each retry round after the main proxy fails. |
| **`fallback_fetch_function`**| `async (str) -> str or None` (None) | Async function called as last resort after all retries are exhausted. Takes URL, returns raw HTML. See [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md). | | **`fallback_fetch_function`**| `async (str) -> str or None` (None) | Async function called as last resort after all retries are exhausted. Takes URL, returns raw HTML. See [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md). |
--- ---

View File

@@ -276,12 +276,11 @@ class CrawlerRunConfig:
- See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control) - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
10.**Proxy Configuration**: 10.**Proxy Configuration**:
- **`proxy_config`**: Proxy server configuration (ProxyConfig object or dict) e.g. {"server": "...", "username": "...", "password"}. Set `is_fallback=True` to only use the proxy when anti-bot blocking is detected. - **`proxy_config`**: Single `ProxyConfig` or `list[ProxyConfig]` — proxies tried in order. Pass a list for automatic escalation.
- **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls - **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls
11.**Anti-Bot Retry & Fallback** (see [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md)): 11.**Anti-Bot Retry & Fallback** (see [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md)):
- **`max_retries`**: Number of retry rounds when blocking is detected (default: 0) - **`max_retries`**: Number of retry rounds when blocking is detected (default: 0). Each round tries all proxies in `proxy_config`.
- **`fallback_proxy_configs`**: List of fallback proxies tried in order within each retry round
- **`fallback_fetch_function`**: Async function called as last resort — takes URL, returns raw HTML - **`fallback_fetch_function`**: Async function called as last resort — takes URL, returns raw HTML
12.**Page Interaction Parameters**: 12.**Page Interaction Parameters**: