From 8df3541ac44c5c217c6c4bb01c00bf7818b47b7e Mon Sep 17 00:00:00 2001
From: unclecode
Date: Thu, 19 Feb 2026 14:05:56 +0000
Subject: [PATCH] Skip anti-bot checks and fallback for raw: URLs

raw: URLs contain caller-provided HTML (e.g. from cache), not content
fetched from a web server. Anti-bot detection, proxy retries, and
fallback fetching are meaningless for this content.

- Skip is_blocked() in retry loop and final re-check for raw: URLs
- Skip fallback_fetch_function invocation for raw: URLs
- Add RAW_DEBUG logging in browser strategy for set_content/page.content
---
 crawl4ai/async_crawler_strategy.py | 26 ++++++++++++++++++++++++++
 crawl4ai/async_webcrawler.py       | 22 +++++++++++++++++-----
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 7e5299d8..576c6319 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if needs_browser:
             # Route through _crawl_web() for full browser pipeline
             # _crawl_web() will detect file:// and raw: URLs and use set_content()
+            if url.startswith("raw"):
+                _raw_len = len(url) - (6 if url.startswith("raw://") else 4)
+                _triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
+                             "capture_mhtml", "js_code", "wait_for", "scan_full_page",
+                             "remove_overlay_elements", "remove_consent_popups",
+                             "simulate_user", "magic", "process_iframes",
+                             "capture_console_messages", "capture_network_requests")
+                             if getattr(config, k, None)]
+                self.logger.info(
+                    message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
+                    tag="RAW_DEBUG",
+                    params={"len": _raw_len, "triggers": _triggers},
+                )
             return await self._crawl_web(url, config)
 
         # Fast path: return HTML directly without browser interaction
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
                 # raw:// or raw:
                 html_content = url[6:] if url.startswith("raw://") else url[4:]
+                self.logger.info(
+                    message="set_content: input_len={input_len}, wait_until={wait_until}",
+                    tag="RAW_DEBUG",
+                    params={"input_len": len(html_content), "wait_until": config.wait_until},
+                )
                 await page.set_content(html_content, wait_until=config.wait_until)
                 response = None
                 # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         else:
             html = await page.content()
 
+        # Debug: log output length for raw: URLs to diagnose empty DOM issues
+        if is_local_content:
+            self.logger.info(
+                message="page.content: output_len={output_len}",
+                tag="RAW_DEBUG",
+                params={"output_len": len(html) if html else 0},
+            )
+
         await self.execute_hook(
             "before_return_html", page=page, html=html, context=context, config=config
         )
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 313a4196..8ab9bb49 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -393,6 +393,11 @@ class AsyncWebCrawler:
         )
 
         # --- Anti-bot retry setup ---
+        # raw: URLs contain caller-provided HTML (e.g. from cache),
+        # not content fetched from a web server. Anti-bot detection,
+        # proxy retries, and fallback fetching are meaningless here.
+        _is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+        _max_attempts = 1 + getattr(config, "max_retries", 0)
         _proxy_list = config._get_proxy_list()
         _original_proxy_config = config.proxy_config
 
@@ -490,9 +495,14 @@ class AsyncWebCrawler:
                     crawl_result.session_id = getattr(config, "session_id", None)
                     crawl_result.cache_status = "miss"
 
-                    # Check if blocked
-                    _blocked, _block_reason = is_blocked(
-                        async_response.status_code, html)
+                    # Check if blocked (skip for raw: URLs —
+                    # caller-provided content, anti-bot N/A)
+                    if _is_raw_url:
+                        _blocked = False
+                        _block_reason = ""
+                    else:
+                        _blocked, _block_reason = is_blocked(
+                            async_response.status_code, html)
 
                     _crawl_stats["proxies_used"].append({
                         "proxy": _proxy.server if _proxy else None,
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
         # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
         # Invoke fallback when: (a) crawl_result exists but is blocked, OR
         # (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
+        # Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
         _fallback_fn = getattr(config, "fallback_fetch_function", None)
-        if _fallback_fn and not _done:
+        if _fallback_fn and not _done and not _is_raw_url:
             _needs_fallback = (
                 crawl_result is None  # All proxies threw exceptions
                 or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
         # Skip re-check when fallback was used — the fallback result is
         # authoritative. Real pages may contain anti-bot script markers
         # (e.g. PerimeterX JS on Walmart) that trigger false positives.
+        # Also skip for raw: URLs — caller-provided content, anti-bot N/A.
         if crawl_result:
-            if not _crawl_stats.get("fallback_fetch_used"):
+            if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
                 _blocked, _block_reason = is_blocked(
                     crawl_result.status_code, crawl_result.html or "")
                 if _blocked: