Skip anti-bot checks and fallback for raw: URLs

raw: URLs contain caller-provided HTML (e.g. from cache), not content
fetched from a web server. Anti-bot detection, proxy retries, and
fallback fetching are meaningless for this content.

- Skip is_blocked() in the retry loop and in the final re-check for raw: URLs
- Skip fallback_fetch_function invocation for raw: URLs
- Add RAW_DEBUG logging in browser strategy for set_content/page.content
This commit is contained in:
unclecode
2026-02-19 14:05:56 +00:00
parent 94a77eea30
commit 8df3541ac4
2 changed files with 43 additions and 5 deletions

View File

@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if needs_browser:
# Route through _crawl_web() for full browser pipeline
# _crawl_web() will detect file:// and raw: URLs and use set_content()
if url.startswith("raw"):
_raw_len = len(url) - (6 if url.startswith("raw://") else 4)
_triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
"capture_mhtml", "js_code", "wait_for", "scan_full_page",
"remove_overlay_elements", "remove_consent_popups",
"simulate_user", "magic", "process_iframes",
"capture_console_messages", "capture_network_requests")
if getattr(config, k, None)]
self.logger.info(
message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
tag="RAW_DEBUG",
params={"len": _raw_len, "triggers": _triggers},
)
return await self._crawl_web(url, config)
# Fast path: return HTML directly without browser interaction
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# raw:// or raw:
html_content = url[6:] if url.startswith("raw://") else url[4:]
self.logger.info(
message="set_content: input_len={input_len}, wait_until={wait_until}",
tag="RAW_DEBUG",
params={"input_len": len(html_content), "wait_until": config.wait_until},
)
await page.set_content(html_content, wait_until=config.wait_until)
response = None
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
else:
html = await page.content()
# Debug: log output length for raw: URLs to diagnose empty DOM issues
if is_local_content:
self.logger.info(
message="page.content: output_len={output_len}",
tag="RAW_DEBUG",
params={"output_len": len(html) if html else 0},
)
await self.execute_hook(
"before_return_html", page=page, html=html, context=context, config=config
)

View File

@@ -393,6 +393,11 @@ class AsyncWebCrawler:
)
# --- Anti-bot retry setup ---
# raw: URLs contain caller-provided HTML (e.g. from cache),
# not content fetched from a web server. Anti-bot detection,
# proxy retries, and fallback fetching are meaningless here.
_is_raw_url = url.startswith("raw:") or url.startswith("raw://")
_max_attempts = 1 + getattr(config, "max_retries", 0)
_proxy_list = config._get_proxy_list()
_original_proxy_config = config.proxy_config
@@ -490,9 +495,14 @@ class AsyncWebCrawler:
crawl_result.session_id = getattr(config, "session_id", None)
crawl_result.cache_status = "miss"
# Check if blocked
_blocked, _block_reason = is_blocked(
async_response.status_code, html)
# Check if blocked (skip for raw: URLs —
# caller-provided content, anti-bot N/A)
if _is_raw_url:
_blocked = False
_block_reason = ""
else:
_blocked, _block_reason = is_blocked(
async_response.status_code, html)
_crawl_stats["proxies_used"].append({
"proxy": _proxy.server if _proxy else None,
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
# Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
_fallback_fn = getattr(config, "fallback_fetch_function", None)
if _fallback_fn and not _done:
if _fallback_fn and not _done and not _is_raw_url:
_needs_fallback = (
crawl_result is None # All proxies threw exceptions
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
# Skip re-check when fallback was used — the fallback result is
# authoritative. Real pages may contain anti-bot script markers
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
# Also skip for raw: URLs — caller-provided content, anti-bot N/A.
if crawl_result:
if not _crawl_stats.get("fallback_fetch_used"):
if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
_blocked, _block_reason = is_blocked(
crawl_result.status_code, crawl_result.html or "")
if _blocked: