From 8df3541ac44c5c217c6c4bb01c00bf7818b47b7e Mon Sep 17 00:00:00 2001
From: unclecode
Date: Thu, 19 Feb 2026 14:05:56 +0000
Subject: [PATCH] Skip anti-bot checks and fallback for raw: URLs

raw: URLs contain caller-provided HTML (e.g. from cache), not content
fetched from a web server. Anti-bot detection, proxy retries, and
fallback fetching are meaningless for this content.

- Skip is_blocked() in retry loop and final re-check for raw: URLs
- Skip fallback_fetch_function invocation for raw: URLs
- Add RAW_DEBUG logging in browser strategy for set_content/page.content
---
 crawl4ai/async_crawler_strategy.py | 26 ++++++++++++++++++++++++++
 crawl4ai/async_webcrawler.py       | 22 +++++++++++++++++-----
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 7e5299d8..576c6319 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if needs_browser:
             # Route through _crawl_web() for full browser pipeline
             # _crawl_web() will detect file:// and raw: URLs and use set_content()
+            if url.startswith("raw"):
+                _raw_len = len(url) - (6 if url.startswith("raw://") else 4)
+                _triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
+                             "capture_mhtml", "js_code", "wait_for", "scan_full_page",
+                             "remove_overlay_elements", "remove_consent_popups",
+                             "simulate_user", "magic", "process_iframes",
+                             "capture_console_messages", "capture_network_requests")
+                             if getattr(config, k, None)]
+                self.logger.info(
+                    message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
+                    tag="RAW_DEBUG",
+                    params={"len": _raw_len, "triggers": _triggers},
+                )
             return await self._crawl_web(url, config)
 
         # Fast path: return HTML directly without browser interaction
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
                 # raw:// or raw:
                 html_content = url[6:] if url.startswith("raw://") else url[4:]
+                self.logger.info(
+                    message="set_content: input_len={input_len}, wait_until={wait_until}",
+                    tag="RAW_DEBUG",
+                    params={"input_len": len(html_content), "wait_until": config.wait_until},
+                )
                 await page.set_content(html_content, wait_until=config.wait_until)
                 response = None
                 # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         else:
             html = await page.content()
 
+        # Debug: log output length for raw: URLs to diagnose empty DOM issues
+        if is_local_content:
+            self.logger.info(
+                message="page.content: output_len={output_len}",
+                tag="RAW_DEBUG",
+                params={"output_len": len(html) if html else 0},
+            )
+
         await self.execute_hook(
             "before_return_html", page=page, html=html, context=context, config=config
         )
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 313a4196..8ab9bb49 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -393,6 +393,11 @@ class AsyncWebCrawler:
         )
 
         # --- Anti-bot retry setup ---
+        # raw: URLs contain caller-provided HTML (e.g. from cache),
+        # not content fetched from a web server. Anti-bot detection,
+        # proxy retries, and fallback fetching are meaningless here.
+        _is_raw_url = url.startswith("raw:") or url.startswith("raw://")
+        _max_attempts = 1 + getattr(config, "max_retries", 0)
         _proxy_list = config._get_proxy_list()
         _original_proxy_config = config.proxy_config
 
@@ -490,9 +495,14 @@ class AsyncWebCrawler:
                     crawl_result.session_id = getattr(config, "session_id", None)
                     crawl_result.cache_status = "miss"
 
-                    # Check if blocked
-                    _blocked, _block_reason = is_blocked(
-                        async_response.status_code, html)
+                    # Check if blocked (skip for raw: URLs —
+                    # caller-provided content, anti-bot N/A)
+                    if _is_raw_url:
+                        _blocked = False
+                        _block_reason = ""
+                    else:
+                        _blocked, _block_reason = is_blocked(
+                            async_response.status_code, html)
 
                     _crawl_stats["proxies_used"].append({
                         "proxy": _proxy.server if _proxy else None,
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
         # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
         # Invoke fallback when: (a) crawl_result exists but is blocked, OR
         # (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
+        # Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
         _fallback_fn = getattr(config, "fallback_fetch_function", None)
-        if _fallback_fn and not _done:
+        if _fallback_fn and not _done and not _is_raw_url:
             _needs_fallback = (
                 crawl_result is None  # All proxies threw exceptions
                 or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
         # Skip re-check when fallback was used — the fallback result is
         # authoritative. Real pages may contain anti-bot script markers
         # (e.g. PerimeterX JS on Walmart) that trigger false positives.
+        # Also skip for raw: URLs — caller-provided content, anti-bot N/A.
         if crawl_result:
-            if not _crawl_stats.get("fallback_fetch_used"):
+            if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
                 _blocked, _block_reason = is_blocked(
                     crawl_result.status_code, crawl_result.html or "")
                 if _blocked: