Fix fallback fetch: run when all proxies crash, skip re-check, never return None

Three related fixes to the anti-bot proxy retry + fallback pipeline:

1. Allow fallback_fetch_function to run when crawl_result is None (all proxies
   threw exceptions like browser crashes). Previously fallback only ran when
   crawl_result existed but was blocked — exception-only failures bypassed it.

2. Skip the is_blocked() re-check once the fallback has been used (the skip is
   keyed on crawl_stats["fallback_fetch_used"]). Real, unblocked pages may
   contain anti-bot script markers (e.g. PerimeterX JS on Walmart) that trigger
   false positives; the re-check would then flip success=True back to False.

3. Always return a CrawlResult with crawl_stats, never None. When all proxies
   fail and the fallback is either not configured or also fails, create a
   minimal failed result so callers get stats about what was attempted instead
   of hitting an AttributeError on None.

Also: if aprocess_html fails during fallback (e.g. a dead browser cannot run
Page.evaluate for consent popup removal), fall back to a minimal successful
result built from the raw (sanitized) HTML instead of silently discarding the
successfully fetched fallback content.
This commit is contained in:
unclecode
2026-02-15 10:55:00 +00:00
parent 45d8e1450f
commit ccd24aa824

View File

@@ -529,11 +529,15 @@ class AsyncWebCrawler:
config.proxy_config = _original_proxy_config config.proxy_config = _original_proxy_config
# --- Fallback fetch function (last resort after all retries+proxies exhausted) --- # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
if (crawl_result # Invoke fallback when: (a) crawl_result exists but is blocked, OR
and getattr(config, "fallback_fetch_function", None)): # (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
_blocked, _ = is_blocked( _fallback_fn = getattr(config, "fallback_fetch_function", None)
crawl_result.status_code, crawl_result.html or "") if _fallback_fn and not _done:
if _blocked: _needs_fallback = (
crawl_result is None # All proxies threw exceptions
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
)
if _needs_fallback:
self.logger.warning( self.logger.warning(
message="All retries exhausted, invoking fallback_fetch_function for {url}", message="All retries exhausted, invoking fallback_fetch_function for {url}",
tag="ANTIBOT", tag="ANTIBOT",
@@ -541,11 +545,13 @@ class AsyncWebCrawler:
) )
_crawl_stats["fallback_fetch_used"] = True _crawl_stats["fallback_fetch_used"] = True
try: try:
_fallback_html = await config.fallback_fetch_function(url) _fallback_html = await _fallback_fn(url)
if _fallback_html: if _fallback_html:
_sanitized_html = sanitize_input_encode(_fallback_html)
try:
crawl_result = await self.aprocess_html( crawl_result = await self.aprocess_html(
url=url, url=url,
html=sanitize_input_encode(_fallback_html), html=_sanitized_html,
extracted_content=extracted_content, extracted_content=extracted_content,
config=config, config=config,
screenshot_data=None, screenshot_data=None,
@@ -556,6 +562,21 @@ class AsyncWebCrawler:
original_scheme=urlparse(url).scheme, original_scheme=urlparse(url).scheme,
**kwargs, **kwargs,
) )
except Exception as _proc_err:
# aprocess_html may fail if browser is dead (e.g.,
# consent popup removal needs Page.evaluate).
# Fall back to a minimal result with raw HTML.
self.logger.warning(
message="Fallback HTML processing failed ({err}), using raw HTML",
tag="ANTIBOT",
params={"err": str(_proc_err)[:100]},
)
crawl_result = CrawlResult(
url=url,
html=_sanitized_html,
success=True,
status_code=200,
)
crawl_result.success = True crawl_result.success = True
crawl_result.status_code = 200 crawl_result.status_code = 200
crawl_result.session_id = getattr(config, "session_id", None) crawl_result.session_id = getattr(config, "session_id", None)
@@ -569,13 +590,29 @@ class AsyncWebCrawler:
) )
# --- Mark blocked results as failed --- # --- Mark blocked results as failed ---
# Skip re-check when fallback was used — the fallback result is
# authoritative. Real pages may contain anti-bot script markers
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
if crawl_result: if crawl_result:
if not _crawl_stats.get("fallback_fetch_used"):
_blocked, _block_reason = is_blocked( _blocked, _block_reason = is_blocked(
crawl_result.status_code, crawl_result.html or "") crawl_result.status_code, crawl_result.html or "")
if _blocked: if _blocked:
crawl_result.success = False crawl_result.success = False
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}" crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
crawl_result.crawl_stats = _crawl_stats crawl_result.crawl_stats = _crawl_stats
else:
# All proxies threw exceptions and fallback either wasn't
# configured or also failed. Build a minimal result so the
# caller gets crawl_stats instead of None.
crawl_result = CrawlResult(
url=url,
html="",
success=False,
status_code=None,
error_message=f"All proxies failed: {_block_reason}" if _block_reason else "All proxies failed",
)
crawl_result.crawl_stats = _crawl_stats
# Compute head fingerprint for cache validation # Compute head fingerprint for cache validation
if crawl_result and crawl_result.html: if crawl_result and crawl_result.html: