Unify proxy_config to accept list, add crawl_stats tracking

- proxy_config on CrawlerRunConfig now accepts a single ProxyConfig or a list of ProxyConfig tried in order (first success wins)
- Remove is_fallback from ProxyConfig and fallback_proxy_configs from CrawlerRunConfig — proxy escalation is now handled entirely by list order
- Add _get_proxy_list() normalizer for the retry loop
- Add CrawlResult.crawl_stats with attempts, retries, proxies_used, fallback_fetch_used, and resolved_by for billing and observability
- Set success=False with error_message when all attempts are blocked
- Simplify the retry loop — no more is_fallback stashing logic
- Update docs and tests to reflect the new API
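In practice, the new surface looks like this (a minimal sketch assembled from the docs changes below; endpoints and credentials are placeholders):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig

# Proxies are tried in order each round; the first one that isn't blocked wins.
config = CrawlerRunConfig(
    max_retries=1,
    proxy_config=[
        ProxyConfig(server="http://datacenter-proxy.example.com:8080",
                    username="user", password="pass"),
        ProxyConfig(server="http://residential-proxy.example.com:9090",
                    username="user", password="pass"),
    ],
)

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        # crawl_stats is the new per-attempt audit trail added by this commit.
        print(result.crawl_stats["resolved_by"], result.crawl_stats["attempts"])

asyncio.run(main())
```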
@@ -353,7 +353,6 @@ class ProxyConfig:
         username: Optional[str] = None,
         password: Optional[str] = None,
         ip: Optional[str] = None,
-        is_fallback: bool = False,
     ):
         """Configuration class for a single proxy.

@@ -362,13 +361,10 @@ class ProxyConfig:
             username: Optional username for proxy authentication
             password: Optional password for proxy authentication
             ip: Optional IP address for verification purposes
-            is_fallback: If True, proxy is only used when anti-bot blocking is
-                detected. If False (default), proxy is used on every request.
         """
         self.server = server
         self.username = username
         self.password = password
-        self.is_fallback = is_fallback

         # Extract IP from server if not explicitly provided
         self.ip = ip or self._extract_ip_from_server()
@@ -430,7 +426,6 @@ class ProxyConfig:
             username=proxy_dict.get("username"),
             password=proxy_dict.get("password"),
             ip=proxy_dict.get("ip"),
-            is_fallback=proxy_dict.get("is_fallback", False),
         )

     @staticmethod
@@ -461,7 +456,6 @@ class ProxyConfig:
             "username": self.username,
             "password": self.password,
             "ip": self.ip,
-            "is_fallback": self.is_fallback,
         }

     def clone(self, **kwargs) -> "ProxyConfig":
@@ -1379,7 +1373,7 @@ class CrawlerRunConfig():
         prettiify: bool = False,
         parser_type: str = "lxml",
         scraping_strategy: ContentScrapingStrategy = None,
-        proxy_config: Union[ProxyConfig, dict, None] = None,
+        proxy_config: Union["ProxyConfig", List["ProxyConfig"], dict, str, None] = None,
         proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
         # Sticky Proxy Session Parameters
         proxy_session_id: Optional[str] = None,
@@ -1478,7 +1472,6 @@ class CrawlerRunConfig():
         experimental: Dict[str, Any] = None,
         # Anti-Bot Retry Parameters
         max_retries: int = 0,
-        fallback_proxy_configs: Optional[List["ProxyConfig"]] = None,
         fallback_fetch_function: Optional[Callable[[str], Awaitable[str]]] = None,
     ):
         # TODO: Planning to set properties dynamically based on the __init__ signature
@@ -1501,11 +1494,23 @@ class CrawlerRunConfig():
         self.prettiify = prettiify
         self.parser_type = parser_type
         self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
-        self.proxy_config = proxy_config
-        if isinstance(proxy_config, dict):
+        # Normalize proxy_config: single ProxyConfig stored as-is, list stored as list
+        if isinstance(proxy_config, list):
+            normalized = []
+            for p in proxy_config:
+                if isinstance(p, dict):
+                    normalized.append(ProxyConfig.from_dict(p))
+                elif isinstance(p, str):
+                    normalized.append(ProxyConfig.from_string(p))
+                else:
+                    normalized.append(p)
+            self.proxy_config = normalized
+        elif isinstance(proxy_config, dict):
             self.proxy_config = ProxyConfig.from_dict(proxy_config)
-        if isinstance(proxy_config, str):
+        elif isinstance(proxy_config, str):
             self.proxy_config = ProxyConfig.from_string(proxy_config)
+        else:
+            self.proxy_config = proxy_config  # ProxyConfig or None

         self.proxy_rotation_strategy = proxy_rotation_strategy

@@ -1665,7 +1670,6 @@ class CrawlerRunConfig():

         # Anti-Bot Retry Parameters
         self.max_retries = max_retries
-        self.fallback_proxy_configs = fallback_proxy_configs or []
         self.fallback_fetch_function = fallback_fetch_function

         # Compile C4A scripts if provided
@@ -1673,6 +1677,14 @@ class CrawlerRunConfig():
             self._compile_c4a_script()


+    def _get_proxy_list(self) -> list:
+        """Normalize proxy_config to a list for the retry loop."""
+        if self.proxy_config is None:
+            return [None]
+        if isinstance(self.proxy_config, list):
+            return self.proxy_config if self.proxy_config else [None]
+        return [self.proxy_config]
+
     def _compile_c4a_script(self):
         """Compile C4A script to JavaScript"""
         try:
@@ -1828,7 +1840,11 @@ class CrawlerRunConfig():
             "prettiify": self.prettiify,
             "parser_type": self.parser_type,
             "scraping_strategy": self.scraping_strategy,
-            "proxy_config": self.proxy_config.to_dict() if hasattr(self.proxy_config, 'to_dict') else self.proxy_config,
+            "proxy_config": (
+                [p.to_dict() if hasattr(p, 'to_dict') else p for p in self.proxy_config]
+                if isinstance(self.proxy_config, list)
+                else (self.proxy_config.to_dict() if hasattr(self.proxy_config, 'to_dict') else self.proxy_config)
+            ),
             "proxy_rotation_strategy": self.proxy_rotation_strategy,
             "proxy_session_id": self.proxy_session_id,
             "proxy_session_ttl": self.proxy_session_ttl,
@@ -1903,7 +1919,6 @@ class CrawlerRunConfig():
             "match_mode": self.match_mode,
             "experimental": self.experimental,
             "max_retries": self.max_retries,
-            "fallback_proxy_configs": [p.to_dict() for p in self.fallback_proxy_configs] if self.fallback_proxy_configs else [],
         }

     def clone(self, **kwargs):
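A quick behavioral sketch of the normalization added above (assertions are illustrative, assuming the `__init__` coercion and `_get_proxy_list()` from these hunks):

```python
from crawl4ai.async_configs import CrawlerRunConfig, ProxyConfig

# No proxy: the retry loop still makes one direct attempt per round.
cfg = CrawlerRunConfig(proxy_config=None)
assert cfg._get_proxy_list() == [None]

# Dicts inside a list are coerced to ProxyConfig via from_dict().
cfg = CrawlerRunConfig(proxy_config=[{"server": "http://1.2.3.4:8080"}])
assert all(isinstance(p, ProxyConfig) for p in cfg._get_proxy_list())

# An empty list degrades to a single direct attempt.
cfg = CrawlerRunConfig(proxy_config=[])
assert cfg._get_proxy_list() == [None]
```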
@@ -393,24 +393,26 @@ class AsyncWebCrawler:
             )

             # --- Anti-bot retry setup ---
-            _fallback_proxy = None
-            if (config.proxy_config
-                    and getattr(config.proxy_config, "is_fallback", False)):
-                _fallback_proxy = config.proxy_config
-                config.proxy_config = None
-
             _max_attempts = 1 + getattr(config, "max_retries", 0)
-            _fallback_proxies = getattr(config, "fallback_proxy_configs", None) or []
-            _proxy_activated = False
+            _proxy_list = config._get_proxy_list()
+            _original_proxy_config = config.proxy_config
             _block_reason = ""
             _done = False
             crawl_result = None
+            _crawl_stats = {
+                "attempts": 0,
+                "retries": 0,
+                "proxies_used": [],
+                "fallback_fetch_used": False,
+                "resolved_by": None,
+            }

             for _attempt in range(_max_attempts):
                 if _done:
                     break

                 if _attempt > 0:
+                    _crawl_stats["retries"] = _attempt
                     self.logger.warning(
                         message="Anti-bot retry {attempt}/{max_retries} for {url} — {reason}",
                         tag="ANTIBOT",
@@ -421,38 +423,22 @@
                             "reason": _block_reason,
                         },
                     )
-                    # Activate is_fallback proxy on first retry
-                    if _fallback_proxy and not _proxy_activated:
-                        config.proxy_config = _fallback_proxy
-                        _proxy_activated = True
-                        self.logger.info(
-                            message="Activating fallback proxy: {proxy}",
-                            tag="ANTIBOT",
-                            params={"proxy": _fallback_proxy.server},
-                        )
-
-                # Build list of proxies to try this round:
-                # current config.proxy_config first, then each fallback proxy
-                _proxies_this_round = [config.proxy_config]  # main (may be None)
-                _proxies_this_round.extend(_fallback_proxies)

-                for _p_idx, _proxy in enumerate(_proxies_this_round):
-                    _is_fallback_proxy = _p_idx > 0
-                    if _is_fallback_proxy:
+                for _p_idx, _proxy in enumerate(_proxy_list):
+                    if _p_idx > 0 or _attempt > 0:
                         self.logger.info(
-                            message="Trying fallback proxy {idx}/{total}: {proxy}",
+                            message="Trying proxy {idx}/{total}: {proxy}",
                             tag="ANTIBOT",
                             params={
-                                "idx": _p_idx,
-                                "total": len(_fallback_proxies),
-                                "proxy": _proxy.server,
+                                "idx": _p_idx + 1,
+                                "total": len(_proxy_list),
+                                "proxy": _proxy.server if _proxy else "direct",
                             },
                         )

-                    # Temporarily set the proxy for this attempt
-                    _saved_proxy = config.proxy_config
-                    if _is_fallback_proxy:
-                        config.proxy_config = _proxy
+                    # Set the active proxy for this attempt
+                    config.proxy_config = _proxy
+                    _crawl_stats["attempts"] += 1

                     try:
                         t1 = time.perf_counter()
@@ -507,27 +493,38 @@
                         # Check if blocked
                         _blocked, _block_reason = is_blocked(
                             async_response.status_code, html)

+                        _crawl_stats["proxies_used"].append({
+                            "proxy": _proxy.server if _proxy else None,
+                            "status_code": async_response.status_code,
+                            "blocked": _blocked,
+                            "reason": _block_reason if _blocked else "",
+                        })
+
                         if not _blocked:
+                            _crawl_stats["resolved_by"] = "proxy" if _proxy else "direct"
                             _done = True
                             break  # Success — exit proxy loop

                     except Exception as _crawl_err:
-                        if _is_fallback_proxy:
+                        _crawl_stats["proxies_used"].append({
+                            "proxy": _proxy.server if _proxy else None,
+                            "status_code": None,
+                            "blocked": True,
+                            "reason": str(_crawl_err),
+                        })
+                        if _p_idx > 0 or _attempt > 0:
                             self.logger.error_status(
                                 url=url,
-                                error=f"Fallback proxy {_proxy.server} failed: {_crawl_err}",
+                                error=f"Proxy {_proxy.server if _proxy else 'direct'} failed: {_crawl_err}",
                                 tag="ANTIBOT",
                             )
                             _block_reason = str(_crawl_err)
                         else:
-                            raise  # Let main proxy errors propagate normally
-                    finally:
-                        if _is_fallback_proxy:
-                            config.proxy_config = _saved_proxy
+                            raise  # First attempt on first proxy propagates normally

-            # --- Restore stashed is_fallback proxy for config integrity ---
-            if _fallback_proxy and not _proxy_activated:
-                config.proxy_config = _fallback_proxy
+            # Restore original proxy_config
+            config.proxy_config = _original_proxy_config

             # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
             if (crawl_result
@@ -540,6 +537,7 @@
                     tag="ANTIBOT",
                     params={"url": url[:80]},
                 )
+                _crawl_stats["fallback_fetch_used"] = True
                 try:
                     _fallback_html = await config.fallback_fetch_function(url)
                     if _fallback_html:
@@ -560,6 +558,7 @@
                         crawl_result.status_code = 200
                         crawl_result.session_id = getattr(config, "session_id", None)
                         crawl_result.cache_status = "miss"
+                        _crawl_stats["resolved_by"] = "fallback_fetch"
                 except Exception as _fallback_err:
                     self.logger.error_status(
                         url=url,
@@ -567,6 +566,15 @@
                         tag="ANTIBOT",
                     )

+            # --- Mark blocked results as failed ---
+            if crawl_result:
+                _blocked, _block_reason = is_blocked(
+                    crawl_result.status_code, crawl_result.html or "")
+                if _blocked:
+                    crawl_result.success = False
+                    crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
+                crawl_result.crawl_stats = _crawl_stats
+
             # Compute head fingerprint for cache validation
             if crawl_result and crawl_result.html:
                 head_end = crawl_result.html.lower().find('</head>')
@@ -157,6 +157,8 @@ class CrawlResult(BaseModel):
     head_fingerprint: Optional[str] = None
     cached_at: Optional[float] = None
     cache_status: Optional[str] = None  # "hit", "hit_validated", "hit_fallback", "miss"
+    # Anti-bot retry/proxy usage stats
+    crawl_stats: Optional[Dict[str, Any]] = None

     model_config = ConfigDict(arbitrary_types_allowed=True)

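For the failure path, the net effect on the result object when every attempt is blocked looks roughly like this (a hypothetical sketch; the URL is a placeholder and `config`/`crawler` are reused from the example at the top):

```python
# Inside an `async with AsyncWebCrawler() as crawler:` block.
result = await crawler.arun("https://hard-target.example.com", config=config)
if not result.success:
    print(result.error_message)                # "Blocked by anti-bot protection: ..."
    print(result.crawl_stats["resolved_by"])   # None: nothing got through
    print(result.crawl_stats["proxies_used"])  # per-attempt audit trail for billing
```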
@@ -13,39 +13,53 @@ After each crawl attempt, Crawl4AI inspects the HTTP status code and HTML content

 Detection uses structural HTML markers (specific element IDs, script sources, form actions) rather than generic keywords to minimize false positives. A normal page that happens to mention "CAPTCHA" or "Cloudflare" in its content will not be flagged.

+When all attempts fail and blocking is still detected, the result is returned with `success=False` and `error_message` describing the block reason.
+
 ## Configuration Options

 All anti-bot retry options live on `CrawlerRunConfig`:

 | Parameter | Type | Default | Description |
 |---|---|---|---|
+| `proxy_config` | `ProxyConfig`, `list[ProxyConfig]`, or `None` | `None` | Single proxy or ordered list of proxies to try. Each retry round iterates through the full list. |
 | `max_retries` | `int` | `0` | Number of retry rounds when blocking is detected. `0` = no retries. |
-| `fallback_proxy_configs` | `list[ProxyConfig]` | `[]` | List of fallback proxies tried in order within each retry round. |
 | `fallback_fetch_function` | `async (str) -> str` | `None` | Async function called as last resort. Takes URL, returns raw HTML. |

-And on `ProxyConfig`:
-
-| Parameter | Type | Default | Description |
-|---|---|---|---|
-| `is_fallback` | `bool` | `False` | When `True`, this proxy is skipped on the first attempt and only activated after blocking is detected. |
-
 ## Escalation Chain

-Each retry round tries the main proxy first, then each fallback proxy in order. If all rounds are exhausted and the page is still blocked, the fallback fetch function is called as a last resort.
+Each retry round tries every proxy in `proxy_config` in order. If all rounds are exhausted and the page is still blocked, the fallback fetch function is called as a last resort.

 ```
 For each round (1 + max_retries rounds):
-  1. Try with main proxy_config (or no proxy if is_fallback=True on first round)
-  2. If blocked → try fallback_proxy_configs[0]
-  3. If blocked → try fallback_proxy_configs[1]
-  4. ... continue through all fallback proxies
+  1. Try proxy_config[0] (or direct if proxy_config is None)
+  2. If blocked → try proxy_config[1]
+  3. If blocked → try proxy_config[2]
+  4. ... continue through all proxies
   5. If any attempt succeeds → done

 If all rounds exhausted and still blocked:
   6. Call fallback_fetch_function(url) → process returned HTML
 ```

-Worst-case attempts before the fetch function: `(1 + max_retries) x (1 + len(fallback_proxy_configs))`
+Worst-case attempts before the fetch function: `(1 + max_retries) x len(proxy_config)`

+## Crawl Stats
+
+Every crawl result includes a `crawl_stats` dict with detailed attempt tracking:
+
+```python
+result.crawl_stats = {
+    "attempts": 3,                 # total browser attempts made
+    "retries": 1,                  # retry rounds used (0 = succeeded first round)
+    "proxies_used": [              # ordered list of every attempt
+        {"proxy": None, "status_code": 403, "blocked": True, "reason": "Akamai block (Reference #)"},
+        {"proxy": "proxy.io:8080", "status_code": 403, "blocked": True, "reason": "Akamai block (Reference #)"},
+        {"proxy": "premium.io:9090", "status_code": 200, "blocked": False, "reason": ""},
+    ],
+    "fallback_fetch_used": False,  # whether fallback_fetch_function was called
+    "resolved_by": "proxy",        # "direct" | "proxy" | "fallback_fetch" | null (all failed)
+}
+```
+
 ## Usage Examples
@@ -64,9 +78,9 @@ async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
     )
 ```

-### Proxy as Fallback Only
+### Single Proxy

-Use `is_fallback=True` to skip the proxy on the first attempt. If the site doesn't block you, no proxy credits are consumed. If it does, the proxy activates on retry.
+Pass a single `ProxyConfig` — it's used on every attempt. Same behavior as always.

 ```python
 from crawl4ai.async_configs import ProxyConfig
@@ -77,24 +91,23 @@ config = CrawlerRunConfig(
         server="http://proxy.example.com:8080",
         username="user",
         password="pass",
-        is_fallback=True,  # Only used when blocking is detected
     ),
 )
 ```

-### Fallback Proxy List
+### Proxy List (Escalation)

-Try a cheaper proxy first, escalate to a premium proxy if it fails. Both are tried within each retry round.
+Pass a list of proxies. They're tried in order — first one that works wins. Within each retry round, the entire list is tried again.

 ```python
 config = CrawlerRunConfig(
-    max_retries=2,
-    proxy_config=ProxyConfig(
+    max_retries=1,
+    proxy_config=[
+        ProxyConfig(
             server="http://datacenter-proxy.example.com:8080",
             username="user",
             password="pass",
         ),
-    fallback_proxy_configs=[
         ProxyConfig(
             server="http://residential-proxy.example.com:9090",
             username="user",
@@ -104,7 +117,7 @@ config = CrawlerRunConfig(
 )
 ```

-With this setup, each round tries the datacenter proxy first, then the residential proxy. With `max_retries=2`, worst case is 3 rounds x 2 proxies = 6 attempts.
+With this setup, each round tries the datacenter proxy first, then the residential proxy. With `max_retries=1`, worst case is 2 rounds x 2 proxies = 4 attempts.

 ### Fallback Fetch Function

@@ -137,7 +150,7 @@ The function can do anything — call an API, read from a database, return cache

 ### Full Escalation (All Features Combined)

-This example combines every layer: stealth mode, a fallback proxy that only activates when blocked, a list of escalation proxies tried each round, retries, and a final fetch function.
+This example combines every layer: stealth mode, a list of proxies tried in order, retries, and a final fetch function.

 ```python
 import aiohttp
@@ -164,16 +177,13 @@ crawl_config = CrawlerRunConfig(
     wait_until="load",
     max_retries=2,

-    # Primary proxy — is_fallback=True means first attempt runs without it
-    proxy_config=ProxyConfig(
+    # Proxies tried in order — cheapest first
+    proxy_config=[
+        ProxyConfig(
             server="http://datacenter-proxy.example.com:8080",
             username="user",
             password="pass",
-        is_fallback=True,
         ),
-
-    # Fallback proxies — tried in order after main proxy fails each round
-    fallback_proxy_configs=[
         ProxyConfig(
             server="http://residential-proxy.example.com:9090",
             username="user",
@@ -193,6 +203,8 @@ async with AsyncWebCrawler(config=browser_config) as crawler:

     if result.success:
         print(f"Got {len(result.markdown.raw_markdown)} chars of markdown")
+        print(f"Resolved by: {result.crawl_stats['resolved_by']}")
+        print(f"Attempts: {result.crawl_stats['attempts']}")
     else:
         print(f"All attempts failed: {result.error_message}")
 ```
@@ -201,12 +213,12 @@ async with AsyncWebCrawler(config=browser_config) as crawler:

 | Round | Attempt | What runs |
 |---|---|---|
-| 1 | 1 | No proxy (is_fallback skips it) — blocked |
-| 1 | 2 | Residential fallback proxy — blocked (bad IP) |
-| 2 | 1 | Datacenter proxy activated — blocked |
-| 2 | 2 | Residential fallback proxy — blocked |
+| 1 | 1 | Datacenter proxy — blocked |
+| 1 | 2 | Residential proxy — blocked |
+| 2 | 1 | Datacenter proxy — blocked |
+| 2 | 2 | Residential proxy — blocked |
 | 3 | 1 | Datacenter proxy — blocked |
-| 3 | 2 | Residential fallback proxy — blocked |
+| 3 | 2 | Residential proxy — blocked |
 | - | - | `external_fetch(url)` called — returns HTML |

 That's up to 6 browser attempts + 1 function call before giving up.
@@ -214,19 +226,10 @@ That's up to 6 browser attempts + 1 function call before giving up.
 ## Tips

 - **Start with `max_retries=0`** and a `fallback_fetch_function` if you just want a safety net without burning time on retries.
-- **Use `is_fallback=True`** on your proxy to avoid consuming proxy credits on sites that don't need them.
-- **Order fallback proxies cheapest-first** — datacenter proxies before residential, residential before premium.
+- **Order proxies cheapest-first** — datacenter proxies before residential, residential before premium.
 - **Combine with stealth mode** — `BrowserConfig(enable_stealth=True)` and `CrawlerRunConfig(magic=True)` reduce the chance of being blocked in the first place.
 - **`wait_until="load"`** is important for anti-bot sites — the default `domcontentloaded` can return before the anti-bot sensor finishes.
-- **You don't need a primary proxy to use fallback proxies.** If you skip `proxy_config` and only pass `fallback_proxy_configs`, the first attempt each round runs with no proxy. This is useful when you want to try direct access first and only escalate to proxies if blocked:
-  ```python
-  config = CrawlerRunConfig(
-      max_retries=1,
-      fallback_proxy_configs=[proxy_A, proxy_B],
-  )
-  # Round 1: no proxy → proxy_A → proxy_B
-  # Round 2: no proxy → proxy_A → proxy_B
-  ```
+- **Check `crawl_stats`** to understand what happened — how many attempts, which proxy worked, whether the fallback function was needed.

 ## See Also

@@ -109,10 +109,9 @@ We group them by category.
 | **`timezone_id`** | `str or None` (None) | Browser's timezone (e.g., "America/New_York", "Europe/Paris"). |
 | **`geolocation`** | `GeolocationConfig or None` (None) | GPS coordinates configuration. Use `GeolocationConfig(latitude=..., longitude=..., accuracy=...)`. |
 | **`fetch_ssl_certificate`** | `bool` (False) | If `True`, fetches and includes SSL certificate information in the result. |
-| **`proxy_config`** | `ProxyConfig or dict or None` (None) | Proxy configuration for this specific crawl. Can override browser-level proxy settings. Set `is_fallback=True` on the ProxyConfig to only use the proxy when anti-bot blocking is detected. |
+| **`proxy_config`** | `ProxyConfig`, `list[ProxyConfig]`, or `None` (None) | Proxy configuration for this specific crawl. Pass a single proxy or an ordered list of proxies to try. See [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md). |
 | **`proxy_rotation_strategy`** | `ProxyRotationStrategy` (None) | Strategy for rotating proxies during crawl operations. |
-| **`max_retries`** | `int` (0) | Number of retry rounds when anti-bot blocking is detected. Each round tries the main proxy and all fallback proxies. |
-| **`fallback_proxy_configs`** | `list[ProxyConfig]` ([]) | List of fallback proxies tried in order within each retry round after the main proxy fails. |
+| **`max_retries`** | `int` (0) | Number of retry rounds when anti-bot blocking is detected. Each round tries all proxies in `proxy_config`. |
 | **`fallback_fetch_function`** | `async (str) -> str or None` (None) | Async function called as last resort after all retries are exhausted. Takes URL, returns raw HTML. See [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md). |

 ---
@@ -276,12 +276,11 @@ class CrawlerRunConfig:
    - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)

 10. **Proxy Configuration**:
-   - **`proxy_config`**: Proxy server configuration (ProxyConfig object or dict) e.g. {"server": "...", "username": "...", "password"}. Set `is_fallback=True` to only use the proxy when anti-bot blocking is detected.
+   - **`proxy_config`**: Single `ProxyConfig` or `list[ProxyConfig]` — proxies tried in order. Pass a list for automatic escalation.
    - **`proxy_rotation_strategy`**: Strategy for rotating proxies during crawls

 11. **Anti-Bot Retry & Fallback** (see [Anti-Bot & Fallback](../advanced/anti-bot-and-fallback.md)):
-   - **`max_retries`**: Number of retry rounds when blocking is detected (default: 0)
-   - **`fallback_proxy_configs`**: List of fallback proxies tried in order within each retry round
+   - **`max_retries`**: Number of retry rounds when blocking is detected (default: 0). Each round tries all proxies in `proxy_config`.
    - **`fallback_fetch_function`**: Async function called as last resort — takes URL, returns raw HTML

 12. **Page Interaction Parameters**: