Skip anti-bot checks and fallback for raw: URLs

raw: URLs contain caller-provided HTML (e.g. from cache), not content fetched from a web server, so anti-bot detection, proxy retries, and fallback fetching are meaningless for this content.

- Skip is_blocked() in the retry loop and in the final re-check for raw: URLs
- Skip fallback_fetch_function invocation for raw: URLs
- Add RAW_DEBUG logging in the browser strategy for set_content/page.content
2026-02-19 14:05:56 +00:00
parent 94a77eea30
commit 8df3541ac4
2 changed files with 43 additions and 5 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if needs_browser:
                # Route through _crawl_web() for full browser pipeline
                # _crawl_web() will detect file:// and raw: URLs and use set_content()
                if url.startswith("raw"):
                    _raw_len = len(url) - (6 if url.startswith("raw://") else 4)
                    _triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
                        "capture_mhtml", "js_code", "wait_for", "scan_full_page",
                        "remove_overlay_elements", "remove_consent_popups",
                        "simulate_user", "magic", "process_iframes",
                        "capture_console_messages", "capture_network_requests")
                        if getattr(config, k, None)]
                    self.logger.info(
                        message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
                        tag="RAW_DEBUG",
                        params={"len": _raw_len, "triggers": _triggers},
                    )
                return await self._crawl_web(url, config)
            # Fast path: return HTML directly without browser interaction
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        # Strip the "raw://" or "raw:" prefix; the remainder is the HTML payload
                        html_content = url[6:] if url.startswith("raw://") else url[4:]
                    self.logger.info(
                        message="set_content: input_len={input_len}, wait_until={wait_until}",
                        tag="RAW_DEBUG",
                        params={"input_len": len(html_content), "wait_until": config.wait_until},
                    )
                    await page.set_content(html_content, wait_until=config.wait_until)
                    response = None
                    # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            else:
                html = await page.content()
            # Debug: log output length for local content (e.g. raw: URLs) to diagnose empty DOM issues
            if is_local_content:
                self.logger.info(
                    message="page.content: output_len={output_len}",
                    tag="RAW_DEBUG",
                    params={"output_len": len(html) if html else 0},
                )
            await self.execute_hook(
                "before_return_html", page=page, html=html, context=context, config=config
            )
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -393,6 +393,11 @@ class AsyncWebCrawler:
                            )
                    # --- Anti-bot retry setup ---
                    # raw: URLs contain caller-provided HTML (e.g. from cache),
                    # not content fetched from a web server.  Anti-bot detection,
                    # proxy retries, and fallback fetching are meaningless here.
                    _is_raw_url = url.startswith("raw:") or url.startswith("raw://")
                    _max_attempts = 1 + getattr(config, "max_retries", 0)
                    _proxy_list = config._get_proxy_list()
                    _original_proxy_config = config.proxy_config
@@ -490,7 +495,12 @@ class AsyncWebCrawler:
                                crawl_result.session_id = getattr(config, "session_id", None)
                                crawl_result.cache_status = "miss"
-                                # Check if blocked
+                                # Check if blocked (skip for raw: URLs —
                                # caller-provided content, anti-bot N/A)
                                if _is_raw_url:
                                    _blocked = False
                                    _block_reason = ""
                                else:
                                    _blocked, _block_reason = is_blocked(
                                        async_response.status_code, html)
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
                    # --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
                    # Invoke fallback when: (a) crawl_result exists but is blocked, OR
                    # (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
                    # Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
                    _fallback_fn = getattr(config, "fallback_fetch_function", None)
-                    if _fallback_fn and not _done:
+                    if _fallback_fn and not _done and not _is_raw_url:
                        _needs_fallback = (
                            crawl_result is None  # All proxies threw exceptions
                            or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
                    # Skip re-check when fallback was used — the fallback result is
                    # authoritative.  Real pages may contain anti-bot script markers
                    # (e.g. PerimeterX JS on Walmart) that trigger false positives.
                    # Also skip for raw: URLs — caller-provided content, anti-bot N/A.
                    if crawl_result:
-                        if not _crawl_stats.get("fallback_fetch_used"):
+                        if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
                            _blocked, _block_reason = is_blocked(
                                crawl_result.status_code, crawl_result.html or "")
                            if _blocked: