Fix fallback fetch: run when all proxies crash, skip re-check, never return None
Three related fixes to the anti-bot proxy retry + fallback pipeline:

1. Allow fallback_fetch_function to run when crawl_result is None (all proxies threw exceptions such as browser crashes). Previously the fallback ran only when crawl_result existed but was blocked — exception-only failures bypassed it.

2. Skip the is_blocked() re-check after a successful fallback. Real unblocked pages may contain anti-bot script markers (e.g. PerimeterX JS on Walmart) that trigger false positives, overriding success=True back to False.

3. Always return a CrawlResult with crawl_stats, never None. When all proxies and the fallback fail, create a minimal failed result so callers get stats about what was attempted instead of an AttributeError on None.

Also: if aprocess_html fails during the fallback (a dead browser cannot run Page.evaluate for consent-popup removal), fall back to a raw-HTML result instead of silently discarding the successfully fetched fallback content.
This commit is contained in:
@@ -529,11 +529,15 @@ class AsyncWebCrawler:
|
|||||||
config.proxy_config = _original_proxy_config
|
config.proxy_config = _original_proxy_config
|
||||||
|
|
||||||
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
||||||
if (crawl_result
|
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
|
||||||
and getattr(config, "fallback_fetch_function", None)):
|
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
|
||||||
_blocked, _ = is_blocked(
|
_fallback_fn = getattr(config, "fallback_fetch_function", None)
|
||||||
crawl_result.status_code, crawl_result.html or "")
|
if _fallback_fn and not _done:
|
||||||
if _blocked:
|
_needs_fallback = (
|
||||||
|
crawl_result is None # All proxies threw exceptions
|
||||||
|
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
|
||||||
|
)
|
||||||
|
if _needs_fallback:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="All retries exhausted, invoking fallback_fetch_function for {url}",
|
message="All retries exhausted, invoking fallback_fetch_function for {url}",
|
||||||
tag="ANTIBOT",
|
tag="ANTIBOT",
|
||||||
@@ -541,11 +545,13 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
_crawl_stats["fallback_fetch_used"] = True
|
_crawl_stats["fallback_fetch_used"] = True
|
||||||
try:
|
try:
|
||||||
_fallback_html = await config.fallback_fetch_function(url)
|
_fallback_html = await _fallback_fn(url)
|
||||||
if _fallback_html:
|
if _fallback_html:
|
||||||
|
_sanitized_html = sanitize_input_encode(_fallback_html)
|
||||||
|
try:
|
||||||
crawl_result = await self.aprocess_html(
|
crawl_result = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=sanitize_input_encode(_fallback_html),
|
html=_sanitized_html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
config=config,
|
config=config,
|
||||||
screenshot_data=None,
|
screenshot_data=None,
|
||||||
@@ -556,6 +562,21 @@ class AsyncWebCrawler:
|
|||||||
original_scheme=urlparse(url).scheme,
|
original_scheme=urlparse(url).scheme,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
except Exception as _proc_err:
|
||||||
|
# aprocess_html may fail if browser is dead (e.g.,
|
||||||
|
# consent popup removal needs Page.evaluate).
|
||||||
|
# Fall back to a minimal result with raw HTML.
|
||||||
|
self.logger.warning(
|
||||||
|
message="Fallback HTML processing failed ({err}), using raw HTML",
|
||||||
|
tag="ANTIBOT",
|
||||||
|
params={"err": str(_proc_err)[:100]},
|
||||||
|
)
|
||||||
|
crawl_result = CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html=_sanitized_html,
|
||||||
|
success=True,
|
||||||
|
status_code=200,
|
||||||
|
)
|
||||||
crawl_result.success = True
|
crawl_result.success = True
|
||||||
crawl_result.status_code = 200
|
crawl_result.status_code = 200
|
||||||
crawl_result.session_id = getattr(config, "session_id", None)
|
crawl_result.session_id = getattr(config, "session_id", None)
|
||||||
@@ -569,13 +590,29 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# --- Mark blocked results as failed ---
|
# --- Mark blocked results as failed ---
|
||||||
|
# Skip re-check when fallback was used — the fallback result is
|
||||||
|
# authoritative. Real pages may contain anti-bot script markers
|
||||||
|
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
|
||||||
if crawl_result:
|
if crawl_result:
|
||||||
|
if not _crawl_stats.get("fallback_fetch_used"):
|
||||||
_blocked, _block_reason = is_blocked(
|
_blocked, _block_reason = is_blocked(
|
||||||
crawl_result.status_code, crawl_result.html or "")
|
crawl_result.status_code, crawl_result.html or "")
|
||||||
if _blocked:
|
if _blocked:
|
||||||
crawl_result.success = False
|
crawl_result.success = False
|
||||||
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
|
crawl_result.error_message = f"Blocked by anti-bot protection: {_block_reason}"
|
||||||
crawl_result.crawl_stats = _crawl_stats
|
crawl_result.crawl_stats = _crawl_stats
|
||||||
|
else:
|
||||||
|
# All proxies threw exceptions and fallback either wasn't
|
||||||
|
# configured or also failed. Build a minimal result so the
|
||||||
|
# caller gets crawl_stats instead of None.
|
||||||
|
crawl_result = CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
success=False,
|
||||||
|
status_code=None,
|
||||||
|
error_message=f"All proxies failed: {_block_reason}" if _block_reason else "All proxies failed",
|
||||||
|
)
|
||||||
|
crawl_result.crawl_stats = _crawl_stats
|
||||||
|
|
||||||
# Compute head fingerprint for cache validation
|
# Compute head fingerprint for cache validation
|
||||||
if crawl_result and crawl_result.html:
|
if crawl_result and crawl_result.html:
|
||||||
|
|||||||
Reference in New Issue
Block a user