From ccd24aa82456cde4427c6e8e479b3354afc7dfc3 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 15 Feb 2026 10:55:00 +0000 Subject: [PATCH] Fix fallback fetch: run when all proxies crash, skip re-check, never return None MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes to the anti-bot proxy retry + fallback pipeline: 1. Allow fallback_fetch_function to run when crawl_result is None (all proxies threw exceptions, e.g. browser crashes). Previously the fallback only ran when crawl_result existed but was blocked — exception-only failures bypassed it. 2. Skip the is_blocked() re-check when the fallback was used (the skip is keyed on the fallback_fetch_used stat, which is set as soon as the fallback is invoked). Real unblocked pages may contain anti-bot script markers (e.g. PerimeterX JS on Walmart) that trigger false positives, flipping success=True back to False. 3. Always return a CrawlResult with crawl_stats, never None. When all proxies fail and the fallback is either not configured or also fails, create a minimal failed result so callers get stats about what was attempted instead of an AttributeError on None. Also: if aprocess_html fails during fallback (a dead browser can't run Page.evaluate for consent popup removal), fall back to a minimal result carrying the raw HTML instead of silently discarding the successfully fetched fallback content. 
--- crawl4ai/async_webcrawler.py | 85 ++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 24 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9dbf98ef..313a4196 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -529,11 +529,15 @@ class AsyncWebCrawler: config.proxy_config = _original_proxy_config # --- Fallback fetch function (last resort after all retries+proxies exhausted) --- - if (crawl_result - and getattr(config, "fallback_fetch_function", None)): - _blocked, _ = is_blocked( - crawl_result.status_code, crawl_result.html or "") - if _blocked: + # Invoke fallback when: (a) crawl_result exists but is blocked, OR + # (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout). + _fallback_fn = getattr(config, "fallback_fetch_function", None) + if _fallback_fn and not _done: + _needs_fallback = ( + crawl_result is None # All proxies threw exceptions + or is_blocked(crawl_result.status_code, crawl_result.html or "")[0] + ) + if _needs_fallback: self.logger.warning( message="All retries exhausted, invoking fallback_fetch_function for {url}", tag="ANTIBOT", @@ -541,21 +545,38 @@ class AsyncWebCrawler: ) _crawl_stats["fallback_fetch_used"] = True try: - _fallback_html = await config.fallback_fetch_function(url) + _fallback_html = await _fallback_fn(url) if _fallback_html: - crawl_result = await self.aprocess_html( - url=url, - html=sanitize_input_encode(_fallback_html), - extracted_content=extracted_content, - config=config, - screenshot_data=None, - pdf_data=None, - verbose=config.verbose, - is_raw_html=True, - redirected_url=url, - original_scheme=urlparse(url).scheme, - **kwargs, - ) + _sanitized_html = sanitize_input_encode(_fallback_html) + try: + crawl_result = await self.aprocess_html( + url=url, + html=_sanitized_html, + extracted_content=extracted_content, + config=config, + screenshot_data=None, + pdf_data=None, + verbose=config.verbose, + 
is_raw_html=True, + redirected_url=url, + original_scheme=urlparse(url).scheme, + **kwargs, + ) + except Exception as _proc_err: + # aprocess_html may fail if browser is dead (e.g., + # consent popup removal needs Page.evaluate). + # Fall back to a minimal result with raw HTML. + self.logger.warning( + message="Fallback HTML processing failed ({err}), using raw HTML", + tag="ANTIBOT", + params={"err": str(_proc_err)[:100]}, + ) + crawl_result = CrawlResult( + url=url, + html=_sanitized_html, + success=True, + status_code=200, + ) crawl_result.success = True crawl_result.status_code = 200 crawl_result.session_id = getattr(config, "session_id", None) @@ -569,12 +590,28 @@ class AsyncWebCrawler: ) # --- Mark blocked results as failed --- + # Skip re-check when fallback was used — the fallback result is + # authoritative. Real pages may contain anti-bot script markers + # (e.g. PerimeterX JS on Walmart) that trigger false positives. if crawl_result: - _blocked, _block_reason = is_blocked( - crawl_result.status_code, crawl_result.html or "") - if _blocked: - crawl_result.success = False - crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}" + if not _crawl_stats.get("fallback_fetch_used"): + _blocked, _block_reason = is_blocked( + crawl_result.status_code, crawl_result.html or "") + if _blocked: + crawl_result.success = False + crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}" + crawl_result.crawl_stats = _crawl_stats + else: + # All proxies threw exceptions and fallback either wasn't + # configured or also failed. Build a minimal result so the + # caller gets crawl_stats instead of None. + crawl_result = CrawlResult( + url=url, + html="", + success=False, + status_code=None, + error_message=f"All proxies failed: {_block_reason}" if _block_reason else "All proxies failed", + ) crawl_result.crawl_stats = _crawl_stats # Compute head fingerprint for cache validation