Fix simulate_user destroying page content via ArrowDown keypress

The simulate_user/magic block fired keyboard.press("ArrowDown") and
mouse.click at fixed coords (100,100) on every page. This destroyed
content on any site with keyboard event handlers (MkDocs, React Router,
Vue, Angular, SPAs) and clicked interactive elements at that position.

Replace with mouse movement (two-point trajectory) and mouse.wheel
scroll — these generate the same anti-bot signals (mousemove, scroll
events) without triggering JS framework navigation or clicking buttons.

Also remove temporary RAW_DEBUG logging that was left from investigation.
This commit is contained in:
unclecode
2026-02-19 15:03:28 +00:00
parent 8df3541ac4
commit c854e2b899

View File

@@ -12,6 +12,7 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import hashlib
import random
import uuid
from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
@@ -476,19 +477,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if needs_browser:
# Route through _crawl_web() for full browser pipeline
# _crawl_web() will detect file:// and raw: URLs and use set_content()
if url.startswith("raw"):
_raw_len = len(url) - (6 if url.startswith("raw://") else 4)
_triggers = [k for k in ("process_in_browser", "screenshot", "pdf",
"capture_mhtml", "js_code", "wait_for", "scan_full_page",
"remove_overlay_elements", "remove_consent_popups",
"simulate_user", "magic", "process_iframes",
"capture_console_messages", "capture_network_requests")
if getattr(config, k, None)]
self.logger.info(
message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}",
tag="RAW_DEBUG",
params={"len": _raw_len, "triggers": _triggers},
)
return await self._crawl_web(url, config)
# Fast path: return HTML directly without browser interaction
@@ -733,11 +721,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# raw:// or raw:
html_content = url[6:] if url.startswith("raw://") else url[4:]
self.logger.info(
message="set_content: input_len={input_len}, wait_until={wait_until}",
tag="RAW_DEBUG",
params={"input_len": len(html_content), "wait_until": config.wait_until},
)
await page.set_content(html_content, wait_until=config.wait_until)
response = None
# For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string
@@ -966,12 +949,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
params={"error": bw_result.get("error")},
)
# Handle user simulation
# Handle user simulation — generate mouse movement and scroll
# signals that anti-bot systems look for, without firing keyboard
# events (ArrowDown triggers JS framework navigation) or clicking
# at fixed positions (may hit buttons/links and navigate away).
if config.simulate_user or config.magic:
await page.mouse.move(100, 100)
await page.mouse.down()
await page.mouse.up()
await page.keyboard.press("ArrowDown")
await page.mouse.move(random.randint(100, 300), random.randint(150, 300))
await page.mouse.move(random.randint(300, 600), random.randint(200, 400))
await page.mouse.wheel(0, random.randint(200, 400))
# --- Phase 2: Wait for page readiness ---
@@ -1071,14 +1056,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
else:
html = await page.content()
# Debug: log output length for raw: URLs to diagnose empty DOM issues
if is_local_content:
self.logger.info(
message="page.content: output_len={output_len}",
tag="RAW_DEBUG",
params={"output_len": len(html) if html else 0},
)
await self.execute_hook(
"before_return_html", page=page, html=html, context=context, config=config
)