From c854e2b899ea3061eb9a04dfb951ee40202058d6 Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 19 Feb 2026 15:03:28 +0000 Subject: [PATCH] Fix simulate_user destroying page content via ArrowDown keypress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The simulate_user/magic block fired keyboard.press("ArrowDown") and mouse.click at fixed coords (100,100) on every page. This destroyed content on any site with keyboard event handlers (MkDocs, React Router, Vue, Angular, SPAs) and clicked interactive elements at that position. Replace with mouse movement (two-point trajectory) and mouse.wheel scroll — these generate the same anti-bot signals (mousemove, scroll events) without triggering JS framework navigation or clicking buttons. Also remove temporary RAW_DEBUG logging that was left from investigation. --- crawl4ai/async_crawler_strategy.py | 39 ++++++------------------------ 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 576c6319..c213a249 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -12,6 +12,7 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont import hashlib +import random import uuid from .js_snippet import load_js_script from .models import AsyncCrawlResponse @@ -476,19 +477,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if needs_browser: # Route through _crawl_web() for full browser pipeline # _crawl_web() will detect file:// and raw: URLs and use set_content() - if url.startswith("raw"): - _raw_len = len(url) - (6 if url.startswith("raw://") else 4) - _triggers = [k for k in ("process_in_browser", "screenshot", "pdf", - "capture_mhtml", "js_code", "wait_for", "scan_full_page", - "remove_overlay_elements", "remove_consent_popups", - "simulate_user", "magic", "process_iframes", - "capture_console_messages", "capture_network_requests") - if getattr(config, k, None)] - self.logger.info( - message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}", - tag="RAW_DEBUG", - params={"len": _raw_len, "triggers": _triggers}, - ) return await self._crawl_web(url, config) # Fast path: return HTML directly without browser interaction @@ -733,11 +721,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # raw:// or raw: html_content = url[6:] if url.startswith("raw://") else url[4:] - self.logger.info( - message="set_content: input_len={input_len}, wait_until={wait_until}", - tag="RAW_DEBUG", - params={"input_len": len(html_content), "wait_until": config.wait_until}, - ) await page.set_content(html_content, wait_until=config.wait_until) response = None # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string @@ -966,12 +949,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": bw_result.get("error")}, ) - # Handle user simulation + # Handle user simulation — generate mouse movement and scroll + # signals that anti-bot systems look for, without firing keyboard + # events (ArrowDown triggers JS framework navigation) or clicking + # at fixed positions (may hit buttons/links and navigate away). if config.simulate_user or config.magic: - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press("ArrowDown") + await page.mouse.move(random.randint(100, 300), random.randint(150, 300)) + await page.mouse.move(random.randint(300, 600), random.randint(200, 400)) + await page.mouse.wheel(0, random.randint(200, 400)) # --- Phase 2: Wait for page readiness --- @@ -1071,14 +1056,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: html = await page.content() - # Debug: log output length for raw: URLs to diagnose empty DOM issues - if is_local_content: - self.logger.info( - message="page.content: output_len={output_len}", - tag="RAW_DEBUG", - params={"output_len": len(html) if html else 0}, - ) - await self.execute_hook( "before_return_html", page=page, html=html, context=context, config=config )