Skip anti-bot checks and fallback for raw: URLs
raw: URLs contain caller-provided HTML (e.g. from cache), not content fetched from a web server, so anti-bot detection, proxy retries, and fallback fetching are meaningless for this content.
- Skip is_blocked() in the retry loop and in the final re-check for raw: URLs
- Skip fallback_fetch_function invocation for raw: URLs
- Add RAW_DEBUG logging in the browser strategy for set_content/page.content
This commit is contained in:
@@ -476,6 +476,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
if needs_browser:
|
if needs_browser:
|
||||||
# Route through _crawl_web() for full browser pipeline
|
# Route through _crawl_web() for full browser pipeline
|
||||||
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
||||||
|
if url.startswith("raw"):
|
||||||
|
_raw_len = len(url) - (6 if url.startswith("raw://") else 4)
|
||||||
|
_triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
|
||||||
|
"capture_mhtml", "js_code", "wait_for", "scan_full_page",
|
||||||
|
"remove_overlay_elements", "remove_consent_popups",
|
||||||
|
"simulate_user", "magic", "process_iframes",
|
||||||
|
"capture_console_messages", "capture_network_requests")
|
||||||
|
if getattr(config, k, None)]
|
||||||
|
self.logger.info(
|
||||||
|
message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
|
||||||
|
tag="RAW_DEBUG",
|
||||||
|
params={"len": _raw_len, "triggers": _triggers},
|
||||||
|
)
|
||||||
return await self._crawl_web(url, config)
|
return await self._crawl_web(url, config)
|
||||||
|
|
||||||
# Fast path: return HTML directly without browser interaction
|
# Fast path: return HTML directly without browser interaction
|
||||||
@@ -720,6 +733,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# raw:// or raw:
|
# raw:// or raw:
|
||||||
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
message="set_content: input_len={input_len}, wait_until={wait_until}",
|
||||||
|
tag="RAW_DEBUG",
|
||||||
|
params={"input_len": len(html_content), "wait_until": config.wait_until},
|
||||||
|
)
|
||||||
await page.set_content(html_content, wait_until=config.wait_until)
|
await page.set_content(html_content, wait_until=config.wait_until)
|
||||||
response = None
|
response = None
|
||||||
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
|
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
|
||||||
@@ -1053,6 +1071,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
else:
|
else:
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
|
|
||||||
|
# Debug: log output length for raw: URLs to diagnose empty DOM issues
|
||||||
|
if is_local_content:
|
||||||
|
self.logger.info(
|
||||||
|
message="page.content: output_len={output_len}",
|
||||||
|
tag="RAW_DEBUG",
|
||||||
|
params={"output_len": len(html) if html else 0},
|
||||||
|
)
|
||||||
|
|
||||||
await self.execute_hook(
|
await self.execute_hook(
|
||||||
"before_return_html", page=page, html=html, context=context, config=config
|
"before_return_html", page=page, html=html, context=context, config=config
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -393,6 +393,11 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# --- Anti-bot retry setup ---
|
# --- Anti-bot retry setup ---
|
||||||
|
# raw: URLs contain caller-provided HTML (e.g. from cache),
|
||||||
|
# not content fetched from a web server. Anti-bot detection,
|
||||||
|
# proxy retries, and fallback fetching are meaningless here.
|
||||||
|
_is_raw_url = url.startswith("raw:") or url.startswith("raw://")
|
||||||
|
|
||||||
_max_attempts = 1 + getattr(config, "max_retries", 0)
|
_max_attempts = 1 + getattr(config, "max_retries", 0)
|
||||||
_proxy_list = config._get_proxy_list()
|
_proxy_list = config._get_proxy_list()
|
||||||
_original_proxy_config = config.proxy_config
|
_original_proxy_config = config.proxy_config
|
||||||
@@ -490,7 +495,12 @@ class AsyncWebCrawler:
|
|||||||
crawl_result.session_id = getattr(config, "session_id", None)
|
crawl_result.session_id = getattr(config, "session_id", None)
|
||||||
crawl_result.cache_status = "miss"
|
crawl_result.cache_status = "miss"
|
||||||
|
|
||||||
# Check if blocked
|
# Check if blocked (skip for raw: URLs —
|
||||||
|
# caller-provided content, anti-bot N/A)
|
||||||
|
if _is_raw_url:
|
||||||
|
_blocked = False
|
||||||
|
_block_reason = ""
|
||||||
|
else:
|
||||||
_blocked, _block_reason = is_blocked(
|
_blocked, _block_reason = is_blocked(
|
||||||
async_response.status_code, html)
|
async_response.status_code, html)
|
||||||
|
|
||||||
@@ -531,8 +541,9 @@ class AsyncWebCrawler:
|
|||||||
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
# --- Fallback fetch function (last resort after all retries+proxies exhausted) ---
|
||||||
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
|
# Invoke fallback when: (a) crawl_result exists but is blocked, OR
|
||||||
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
|
# (b) crawl_result is None because all proxies threw exceptions (browser crash, timeout).
|
||||||
|
# Skip for raw: URLs — fallback expects a real URL, not raw HTML content.
|
||||||
_fallback_fn = getattr(config, "fallback_fetch_function", None)
|
_fallback_fn = getattr(config, "fallback_fetch_function", None)
|
||||||
if _fallback_fn and not _done:
|
if _fallback_fn and not _done and not _is_raw_url:
|
||||||
_needs_fallback = (
|
_needs_fallback = (
|
||||||
crawl_result is None # All proxies threw exceptions
|
crawl_result is None # All proxies threw exceptions
|
||||||
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
|
or is_blocked(crawl_result.status_code, crawl_result.html or "")[0]
|
||||||
@@ -593,8 +604,9 @@ class AsyncWebCrawler:
|
|||||||
# Skip re-check when fallback was used — the fallback result is
|
# Skip re-check when fallback was used — the fallback result is
|
||||||
# authoritative. Real pages may contain anti-bot script markers
|
# authoritative. Real pages may contain anti-bot script markers
|
||||||
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
|
# (e.g. PerimeterX JS on Walmart) that trigger false positives.
|
||||||
|
# Also skip for raw: URLs — caller-provided content, anti-bot N/A.
|
||||||
if crawl_result:
|
if crawl_result:
|
||||||
if not _crawl_stats.get("fallback_fetch_used"):
|
if not _crawl_stats.get("fallback_fetch_used") and not _is_raw_url:
|
||||||
_blocked, _block_reason = is_blocked(
|
_blocked, _block_reason = is_blocked(
|
||||||
crawl_result.status_code, crawl_result.html or "")
|
crawl_result.status_code, crawl_result.html or "")
|
||||||
if _blocked:
|
if _blocked:
|
||||||
|
|||||||
Reference in New Issue
Block a user