Fix simulate_user destroying page content via ArrowDown keypress
The simulate_user/magic block fired keyboard.press("ArrowDown") and
mouse.click at fixed coords (100,100) on every page. This destroyed
content on any site with keyboard event handlers (MkDocs, React Router,
Vue, Angular, SPAs) and clicked interactive elements at that position.
Replace with mouse movement (two-point trajectory) and mouse.wheel
scroll — these generate the same anti-bot signals (mousemove, scroll
events) without triggering JS framework navigation or clicking buttons.
Also remove temporary RAW_DEBUG logging that was left from investigation.
This commit is contained in:
@@ -12,6 +12,7 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
from io import BytesIO
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import hashlib
|
||||
import random
|
||||
import uuid
|
||||
from .js_snippet import load_js_script
|
||||
from .models import AsyncCrawlResponse
|
||||
@@ -476,19 +477,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if needs_browser:
|
||||
# Route through _crawl_web() for full browser pipeline
|
||||
# _crawl_web() will detect file:// and raw: URLs and use set_content()
|
||||
if url.startswith("raw"):
|
||||
_raw_len = len(url) - (6 if url.startswith("raw://") else 4)
|
||||
_triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
|
||||
"capture_mhtml", "js_code", "wait_for", "scan_full_page",
|
||||
"remove_overlay_elements", "remove_consent_popups",
|
||||
"simulate_user", "magic", "process_iframes",
|
||||
"capture_console_messages", "capture_network_requests")
|
||||
if getattr(config, k, None)]
|
||||
self.logger.info(
|
||||
message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"len": _raw_len, "triggers": _triggers},
|
||||
)
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
# Fast path: return HTML directly without browser interaction
|
||||
@@ -733,11 +721,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# raw:// or raw:
|
||||
html_content = url[6:] if url.startswith("raw://") else url[4:]
|
||||
|
||||
self.logger.info(
|
||||
message="set_content: input_len={input_len}, wait_until={wait_until}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"input_len": len(html_content), "wait_until": config.wait_until},
|
||||
)
|
||||
await page.set_content(html_content, wait_until=config.wait_until)
|
||||
response = None
|
||||
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
|
||||
@@ -966,12 +949,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
params={"error": bw_result.get("error")},
|
||||
)
|
||||
|
||||
# Handle user simulation
|
||||
# Handle user simulation — generate mouse movement and scroll
|
||||
# signals that anti-bot systems look for, without firing keyboard
|
||||
# events (ArrowDown triggers JS framework navigation) or clicking
|
||||
# at fixed positions (may hit buttons/links and navigate away).
|
||||
if config.simulate_user or config.magic:
|
||||
await page.mouse.move(100, 100)
|
||||
await page.mouse.down()
|
||||
await page.mouse.up()
|
||||
await page.keyboard.press("ArrowDown")
|
||||
await page.mouse.move(random.randint(100, 300), random.randint(150, 300))
|
||||
await page.mouse.move(random.randint(300, 600), random.randint(200, 400))
|
||||
await page.mouse.wheel(0, random.randint(200, 400))
|
||||
|
||||
# --- Phase 2: Wait for page readiness ---
|
||||
|
||||
@@ -1071,14 +1056,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
else:
|
||||
html = await page.content()
|
||||
|
||||
# Debug: log output length for raw: URLs to diagnose empty DOM issues
|
||||
if is_local_content:
|
||||
self.logger.info(
|
||||
message="page.content: output_len={output_len}",
|
||||
tag="RAW_DEBUG",
|
||||
params={"output_len": len(html) if html else 0},
|
||||
)
|
||||
|
||||
await self.execute_hook(
|
||||
"before_return_html", page=page, html=html, context=context, config=config
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user