diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 576c6319..c213a249 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -12,6 +12,7 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont import hashlib +import random import uuid from .js_snippet import load_js_script from .models import AsyncCrawlResponse @@ -476,19 +477,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if needs_browser: # Route through _crawl_web() for full browser pipeline # _crawl_web() will detect file:// and raw: URLs and use set_content() - if url.startswith("raw"): - _raw_len = len(url) - (6 if url.startswith("raw://") else 4) - _triggers = [k for k in ("process_in_browser", "screenshot", "pdf", - "capture_mhtml", "js_code", "wait_for", "scan_full_page", - "remove_overlay_elements", "remove_consent_popups", - "simulate_user", "magic", "process_iframes", - "capture_console_messages", "capture_network_requests") - if getattr(config, k, None)] - self.logger.info( - message="raw: URL ({len} bytes) routed to browser — triggers: {triggers}", - tag="RAW_DEBUG", - params={"len": _raw_len, "triggers": _triggers}, - ) return await self._crawl_web(url, config) # Fast path: return HTML directly without browser interaction @@ -733,11 +721,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # raw:// or raw: html_content = url[6:] if url.startswith("raw://") else url[4:] - self.logger.info( - message="set_content: input_len={input_len}, wait_until={wait_until}", - tag="RAW_DEBUG", - params={"input_len": len(html_content), "wait_until": config.wait_until}, - ) await page.set_content(html_content, wait_until=config.wait_until) response = None # For raw: URLs, only use base_url if provided; don't fall back to the raw HTML string @@ -966,12 +949,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": bw_result.get("error")}, ) - # Handle user simulation + # Handle user simulation — generate mouse movement and scroll + # signals that anti-bot systems look for, without firing keyboard + # events (ArrowDown triggers JS framework navigation) or clicking + # at fixed positions (may hit buttons/links and navigate away). if config.simulate_user or config.magic: - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press("ArrowDown") + await page.mouse.move(random.randint(100, 300), random.randint(150, 300)) + await page.mouse.move(random.randint(300, 600), random.randint(200, 400)) + await page.mouse.wheel(0, random.randint(200, 400)) # --- Phase 2: Wait for page readiness --- @@ -1071,14 +1056,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: html = await page.content() - # Debug: log output length for raw: URLs to diagnose empty DOM issues - if is_local_content: - self.logger.info( - message="page.content: output_len={output_len}", - tag="RAW_DEBUG", - params={"output_len": len(html) if html else 0}, - ) - await self.execute_hook( "before_return_html", page=page, html=html, context=context, config=config )