From 77559f3373b4832ca080a7fe1bf9f00014eaa5d9 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 18 Sep 2025 15:39:06 +0800 Subject: [PATCH] feat(StealthAdapter): fix stealth features for Playwright integration. ref #1481 --- crawl4ai/browser_adapter.py | 128 ++++++++++++++++++++++++++++++++++++ crawl4ai/browser_manager.py | 52 +++++++-------- 2 files changed, 152 insertions(+), 28 deletions(-) diff --git a/crawl4ai/browser_adapter.py b/crawl4ai/browser_adapter.py index 85fef16e..3d3f5cdc 100644 --- a/crawl4ai/browser_adapter.py +++ b/crawl4ai/browser_adapter.py @@ -148,6 +148,134 @@ class PlaywrightAdapter(BrowserAdapter): return Page, Error, PlaywrightTimeoutError +class StealthAdapter(BrowserAdapter): + """Adapter for Playwright with stealth features using playwright_stealth""" + + def __init__(self): + self._console_script_injected = {} + self._stealth_available = self._check_stealth_availability() + + def _check_stealth_availability(self) -> bool: + """Check if playwright_stealth is available and get the correct function""" + try: + from playwright_stealth import stealth_async + self._stealth_function = stealth_async + return True + except ImportError: + try: + from playwright_stealth import stealth_sync + self._stealth_function = stealth_sync + return True + except ImportError: + self._stealth_function = None + return False + + async def apply_stealth(self, page: Page): + """Apply stealth to a page if available""" + if self._stealth_available and self._stealth_function: + try: + if hasattr(self._stealth_function, '__call__'): + if 'async' in getattr(self._stealth_function, '__name__', ''): + await self._stealth_function(page) + else: + self._stealth_function(page) + except Exception as e: + # Fail silently or log error depending on requirements + pass + + async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any: + """Standard Playwright evaluate with stealth applied""" + if arg is not None: + return await page.evaluate(expression, arg) + return await page.evaluate(expression) + + async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup console capture using Playwright's event system with stealth""" + # Apply stealth to the page first + await self.apply_stealth(page) + + def handle_console_capture(msg): + try: + message_type = "unknown" + try: + message_type = msg.type + except: + pass + + message_text = "unknown" + try: + message_text = msg.text + except: + pass + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time() + } + + captured_console.append(entry) + + except Exception as e: + captured_console.append({ + "type": "console_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + page.on("console", handle_console_capture) + return handle_console_capture + + async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup error capture using Playwright's event system""" + def handle_pageerror_capture(err): + try: + error_message = "Unknown error" + try: + error_message = err.message + except: + pass + + error_stack = "" + try: + error_stack = err.stack + except: + pass + + captured_console.append({ + "type": "error", + "text": error_message, + "stack": error_stack, + "timestamp": time.time() + }) + except Exception as e: + captured_console.append({ + "type": "pageerror_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + page.on("pageerror", handle_pageerror_capture) + return handle_pageerror_capture + + async def retrieve_console_messages(self, page: Page) -> List[Dict]: + """Not needed for Playwright - messages are captured via events""" + return [] + + async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]): + """Remove event listeners""" + if handle_console: + page.remove_listener("console", handle_console) + if handle_error: + page.remove_listener("pageerror", handle_error) + + def get_imports(self) -> tuple: + """Return Playwright imports""" + from playwright.async_api import Page, Error + from playwright.async_api import TimeoutError as PlaywrightTimeoutError + return Page, Error, PlaywrightTimeoutError + + class UndetectedAdapter(BrowserAdapter): """Adapter for undetected browser automation with stealth features""" diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 48737fcb..a0fb2673 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -614,9 +614,11 @@ class BrowserManager: # for all racers). Prevents 'Target page/context closed' errors. self._page_lock = asyncio.Lock() - # Stealth-related attributes - self._stealth_instance = None - self._stealth_cm = None + # Stealth adapter for stealth mode + self._stealth_adapter = None + if self.config.enable_stealth and not self.use_undetected: + from .browser_adapter import StealthAdapter + self._stealth_adapter = StealthAdapter() # Initialize ManagedBrowser if needed if self.config.use_managed_browser: @@ -650,16 +652,8 @@ class BrowserManager: else: from playwright.async_api import async_playwright - # Initialize playwright with or without stealth - if self.config.enable_stealth and not self.use_undetected: - # Import stealth only when needed - from playwright_stealth import Stealth - # Use the recommended stealth wrapper approach - self._stealth_instance = Stealth() - self._stealth_cm = self._stealth_instance.use_async(async_playwright()) - self.playwright = await self._stealth_cm.__aenter__() - else: - self.playwright = await async_playwright().start() + # Initialize playwright + self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True @@ -1009,6 +1003,19 @@ class BrowserManager: signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest() return signature_hash + async def _apply_stealth_to_page(self, page): + """Apply stealth to a page if stealth mode is enabled""" + if self._stealth_adapter: + try: + await self._stealth_adapter.apply_stealth(page) + except Exception as e: + if self.logger: + self.logger.warning( + message="Failed to apply stealth to page: {error}", + tag="STEALTH", + params={"error": str(e)} + ) + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): """ Get a page for the given session ID, creating a new one if needed. @@ -1038,6 +1045,7 @@ class BrowserManager: # See GH-1198: context.pages can be empty under races async with self._page_lock: page = await ctx.new_page() + await self._apply_stealth_to_page(page) else: context = self.default_context pages = context.pages @@ -1054,6 +1062,7 @@ class BrowserManager: page = pages[0] else: page = await context.new_page() + await self._apply_stealth_to_page(page) else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) @@ -1069,6 +1078,7 @@ class BrowserManager: # Create a new page from the chosen context page = await context.new_page() + await self._apply_stealth_to_page(page) # If a session_id is specified, store this session so we can reuse later if crawlerRunConfig.session_id: @@ -1135,19 +1145,5 @@ class BrowserManager: self.managed_browser = None if self.playwright: - # Handle stealth context manager cleanup if it exists - if hasattr(self, '_stealth_cm') and self._stealth_cm is not None: - try: - await self._stealth_cm.__aexit__(None, None, None) - except Exception as e: - if self.logger: - self.logger.error( - message="Error closing stealth context: {error}", - tag="ERROR", - params={"error": str(e)} - ) - self._stealth_cm = None - self._stealth_instance = None - else: - await self.playwright.stop() + await self.playwright.stop() self.playwright = None