Skip anti-bot checks and fallback for raw: URLs
raw: URLs contain caller-provided HTML (e.g. from cache), not content fetched from a web server. Anti-bot detection, proxy retries, and fallback fetching are meaningless for this content. Changes: skip is_blocked() in the retry loop and in the final re-check for raw: URLs; skip the fallback_fetch_function invocation for raw: URLs; add RAW_DEBUG logging in the browser strategy around set_content and page.content.
This commit is contained in:
@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if needs_browser:
|
||||
# Route through _crawl_web() for full browser pipeline
|
||||
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
||||
if url.startswith("raw"):
|
||||
_raw_len = len(url) - (6 if url.startswith("raw://") else 4)
|
||||
_triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
|
||||
"capture_mhtml", "js_code", "wait_for", "scan_full_page",
|
||||
"remove_overlay_elements", "remove_consent_popups",
|
||||
"simulate_user", "magic", "process_iframes",
|
||||
"capture_console_messages", "capture_network_requests")
|
||||
if getattr(config, k, None)]
|
||||
self.logger.info(
|
||||
message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"len": _raw_len, "triggers": _triggers},
|
||||
)
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
# Fast path: return HTML directly without browser interaction
|
||||
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# raw:// or raw:
|
||||
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
self.logger.info(
|
||||
message="set_content: input_len={input_len}, wait_until={wait_until}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"input_len": len(html_content), "wait_until": config.wait_until},
|
||||
)
|
||||
await page.set_content(html_content, wait_until=config.wait_until)
|
||||
response = None
|
||||
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
|
||||
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
else:
|
||||
html = await page.content()
|
||||
|
||||
# Debug: log output length for raw: URLs to diagnose empty DOM issues
|
||||
if is_local_content:
|
||||
self.logger.info(
|
||||
message="page.content: output_len={output_len}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"output_len": len(html) if html else 0},
|
||||
)
|
||||
|
||||
await self.execute_hook(
|
||||
"before_return_html", page=page, html=html, context=context, config=config
|
||||
)
|
||||
|
||||
@@ -393,6 +393,11 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# --- Anti-bot retry setup ---
|
||||
# raw: URLs contain caller-provided HTML (e.g. from cache),
|
||||
# not content fetched from a web server. Anti-bot detection,
|
||||
# proxy retries, and fallback fetching are meaningless here.
|
||||
_is_raw_url = url.startswith("raw:") or url.startswith("raw://")
|
||||
|
||||
_max_attempts = 1 + getattr(config, "max_retries", 0)
|
||||
_proxy_list = config._get_proxy_list()
|
||||
_original_proxy_config = config.proxy_config
|
||||
@@ -490,9 +495,14 @@ class AsyncWebCrawler:
|
||||
crawl_result.session_id = getattr(config, "session_id", None)
|
||||
crawl_result.cache_status = "miss"
|
||||
|
||||
# Check if blocked
|
||||
_blocked, _block_reason = is_blocked(
|
||||
async_response.status_code, html)
|
||||
# Check if blocked (skip for raw: URLs —
|
||||
# caller-provided content, anti-bot N/A)
|
||||
if _is_raw_url:
|
||||
_blocked = False
|
||||
_block_reason = ""
|
||||
else:
|
||||
_blocked, _block_reason = is_blocked(
|
||||
async_response.status_code, html)
|
||||
|
||||
_crawl_stats["proxies_used"].append({
|
||||
"proxy": _proxy.server if _proxy else None,
|
||||
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
|
||||
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
||||
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
|
||||
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
|
||||
# Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
|
||||
_fallback_fn = getattr(config, "fallback_fetch_function", None)
|
||||
if _fallback_fn and not _done:
|
||||
if _fallback_fn and not _done and not _is_raw_url:
|
||||
_needs_fallback = (
|
||||
crawl_result is None # All proxies threw exceptions
|
||||
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
|
||||
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
|
||||
# Skip re-check when fallback was used — the fallback result is
|
||||
# authoritative. Real pages may contain anti-bot script markers
|
||||
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
|
||||
# Also skip for raw: URLs — caller-provided content, anti-bot N/A.
|
||||
if crawl_result:
|
||||
if not _crawl_stats.get("fallback_fetch_used"):
|
||||
if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
|
||||
_blocked, _block_reason = is_blocked(
|
||||
crawl_result.status_code, crawl_result.html or "")
|
||||
if _blocked:
|
||||
|
||||
Reference in New Issue
Block a user