Fix fallback fetch: run when all proxies crash, skip re-check, never return None
Three related fixes to the anti-bot proxy retry + fallback pipeline:

1. Allow fallback_fetch_function to run when crawl_result is None (all proxies threw exceptions such as browser crashes). Previously the fallback ran only when crawl_result existed but was blocked — exception-only failures bypassed it.

2. Skip the is_blocked() re-check after a successful fallback. Real unblocked pages may contain anti-bot script markers (e.g. PerimeterX JS on Walmart) that trigger false positives, overriding success=True back to False.

3. Always return a CrawlResult with crawl_stats, never None. When all proxies and the fallback fail, create a minimal failed result so callers get stats about what was attempted instead of an AttributeError on None.

Also: if aprocess_html fails during the fallback (a dead browser cannot run Page.evaluate for consent-popup removal), fall back to a raw-HTML result instead of silently discarding the successfully fetched fallback content.
This commit is contained in:
@@ -529,11 +529,15 @@ class AsyncWebCrawler:
|
|||||||
config.proxy_config = _original_proxy_config
|
config.proxy_config = _original_proxy_config
|
||||||
|
|
||||||
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
||||||
if (crawl_result
|
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
|
||||||
and getattr(config, "fallback_fetch_function", None)):
|
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
|
||||||
_blocked, _ = is_blocked(
|
_fallback_fn = getattr(config, "fallback_fetch_function", None)
|
||||||
crawl_result.status_code, crawl_result.html or "")
|
if _fallback_fn and not _done:
|
||||||
if _blocked:
|
_needs_fallback = (
|
||||||
|
crawl_result is None # All proxies threw exceptions
|
||||||
|
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
|
||||||
|
)
|
||||||
|
if _needs_fallback:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="All retries exhausted, invoking fallback_fetch_function for {url}",
|
message="All retries exhausted, invoking fallback_fetch_function for {url}",
|
||||||
tag="ANTIBOT",
|
tag="ANTIBOT",
|
||||||
@@ -541,11 +545,13 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
_crawl_stats["fallback_fetch_used"] = True
|
_crawl_stats["fallback_fetch_used"] = True
|
||||||
try:
|
try:
|
||||||
_fallback_html = await config.fallback_fetch_function(url)
|
_fallback_html = await _fallback_fn(url)
|
||||||
if _fallback_html:
|
if _fallback_html:
|
||||||
|
_sanitized_html = sanitize_input_encode(_fallback_html)
|
||||||
|
try:
|
||||||
crawl_result = await self.aprocess_html(
|
crawl_result = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=sanitize_input_encode(_fallback_html),
|
html=_sanitized_html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
config=config,
|
config=config,
|
||||||
screenshot_data=None,
|
screenshot_data=None,
|
||||||
@@ -556,6 +562,21 @@ class AsyncWebCrawler:
|
|||||||
original_scheme=urlparse(url).scheme,
|
original_scheme=urlparse(url).scheme,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
except Exception as _proc_err:
|
||||||
|
# aprocess_html may fail if browser is dead (e.g.,
|
||||||
|
# consent popup removal needs Page.evaluate).
|
||||||
|
# Fall back to a minimal result with raw HTML.
|
||||||
|
self.logger.warning(
|
||||||
|
message="Fallback HTML processing failed ({err}), using raw HTML",
|
||||||
|
tag="ANTIBOT",
|
||||||
|
params={"err": str(_proc_err)[:100]},
|
||||||
|
)
|
||||||
|
crawl_result = CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html=_sanitized_html,
|
||||||
|
success=True,
|
||||||
|
status_code=200,
|
||||||
|
)
|
||||||
crawl_result.success = True
|
crawl_result.success = True
|
||||||
crawl_result.status_code = 200
|
crawl_result.status_code = 200
|
||||||
crawl_result.session_id = getattr(config, "session_id", None)
|
crawl_result.session_id = getattr(config, "session_id", None)
|
||||||
@@ -569,13 +590,29 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# --- Mark blocked results as failed ---
|
# --- Mark blocked results as failed ---
|
||||||
|
# Skip re-check when fallback was used — the fallback result is
|
||||||
|
# authoritative. Real pages may contain anti-bot script markers
|
||||||
|
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
|
||||||
if crawl_result:
|
if crawl_result:
|
||||||
|
if not _crawl_stats.get("fallback_fetch_used"):
|
||||||
_blocked, _block_reason = is_blocked(
|
_blocked, _block_reason = is_blocked(
|
||||||
crawl_result.status_code, crawl_result.html or "")
|
crawl_result.status_code, crawl_result.html or "")
|
||||||
if _blocked:
|
if _blocked:
|
||||||
crawl_result.success = False
|
crawl_result.success = False
|
||||||
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
|
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
|
||||||
crawl_result.crawl_stats = _crawl_stats
|
crawl_result.crawl_stats = _crawl_stats
|
||||||
|
else:
|
||||||
|
# All proxies threw exceptions and fallback either wasn't
|
||||||
|
# configured or also failed. Build a minimal result so the
|
||||||
|
# caller gets crawl_stats instead of None.
|
||||||
|
crawl_result = CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
success=False,
|
||||||
|
status_code=None,
|
||||||
|
error_message=f"All proxies failed: {_block_reason}" if _block_reason else "All proxies failed",
|
||||||
|
)
|
||||||
|
crawl_result.crawl_stats = _crawl_stats
|
||||||
|
|
||||||
# Compute head fingerprint for cache validation
|
# Compute head fingerprint for cache validation
|
||||||
if crawl_result and crawl_result.html:
|
if crawl_result and crawl_result.html:
|
||||||
|
|||||||
Reference in New Issue
Block a user