diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 13a191f0..d96916b4 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -389,7 +389,6 @@ class BrowserConfig: self, browser_type: str = "chromium", headless: bool = True, - stealth: bool = True, browser_mode: str = "dedicated", use_managed_browser: bool = False, cdp_url: str = None, @@ -427,7 +426,6 @@ class BrowserConfig: ): self.browser_type = browser_type self.headless = headless - self.stealth = stealth self.browser_mode = browser_mode self.use_managed_browser = use_managed_browser self.cdp_url = cdp_url @@ -502,7 +500,6 @@ class BrowserConfig: return BrowserConfig( browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), - stealth=kwargs.get("stealth", True), browser_mode=kwargs.get("browser_mode", "dedicated"), use_managed_browser=kwargs.get("use_managed_browser", False), cdp_url=kwargs.get("cdp_url"), @@ -539,7 +536,6 @@ class BrowserConfig: result = { "browser_type": self.browser_type, "headless": self.headless, - "stealth": self.stealth, "browser_mode": self.browser_mode, "use_managed_browser": self.use_managed_browser, "cdp_url": self.cdp_url, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 02320dfe..08c69fe3 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -6,13 +6,26 @@ import time from abc import ABC, abstractmethod from typing import Callable, Dict, Any, List, Union from typing import Optional, AsyncGenerator, Final -from playwright_stealth import Stealth import os from playwright.async_api import Page, Error from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont import hashlib + +# Backward compatible stealth import +try: + # Try new tf-playwright-stealth API (Stealth class) + from playwright_stealth import Stealth + STEALTH_NEW_API = True +except 
class StealthConfig:
    """
    Configuration holder for the stealth evasions applied to a Playwright page.

    The keyword names mirror the options that are forwarded to the
    ``Stealth`` class of the stealth library (see ``stealth_options``).
    A few legacy keyword names are still accepted so that older call sites
    keep working.

    Attributes:
        enabled: Master switch; when False no stealth should be applied.
        stealth_options: Mapping of option name -> value containing every
            option whose value is not None, in declaration order.
    """

    def __init__(
        self,
        # Common settings
        enabled: bool = True,

        # Core evasion toggles
        chrome_app: bool = True,
        chrome_csi: bool = True,
        chrome_load_times: bool = True,
        chrome_runtime: bool = False,  # Note: library default is False
        hairline: bool = True,
        iframe_content_window: bool = True,
        media_codecs: bool = True,
        navigator_hardware_concurrency: bool = True,
        navigator_languages: bool = True,
        navigator_permissions: bool = True,
        navigator_platform: bool = True,
        navigator_plugins: bool = True,
        navigator_user_agent: bool = True,
        navigator_vendor: bool = True,
        navigator_webdriver: bool = True,
        sec_ch_ua: bool = True,
        webgl_vendor: bool = True,

        # Override parameters (None means "do not pass this option at all")
        navigator_languages_override: Optional[tuple] = ("en-US", "en"),
        navigator_platform_override: Optional[str] = "Win32",
        navigator_user_agent_override: Optional[str] = None,
        navigator_vendor_override: Optional[str] = None,
        sec_ch_ua_override: Optional[str] = None,
        webgl_renderer_override: Optional[str] = None,
        webgl_vendor_override: Optional[str] = None,

        # Advanced parameters
        init_scripts_only: bool = False,
        script_logging: bool = False,

        # Legacy parameters kept for backward compatibility
        webdriver: Optional[bool] = None,  # alias of navigator_webdriver
        user_agent_override: Optional[bool] = None,  # alias of navigator_user_agent
        window_outerdimensions: Optional[bool] = None,  # accepted but ignored
    ):
        """
        Build the configuration.

        Args:
            enabled: Whether stealth should be applied at all.
            webdriver: Legacy alias; when not None it replaces
                ``navigator_webdriver``.
            user_agent_override: Legacy alias; when not None it replaces
                ``navigator_user_agent``.
            window_outerdimensions: Legacy option with no counterpart among
                the forwarded options; it is accepted so old call sites do
                not break, and is otherwise discarded.

        All remaining keyword arguments are stored verbatim in
        ``stealth_options`` unless their value is None.
        """
        self.enabled = enabled

        # Legacy aliases win over the modern names when explicitly given.
        if webdriver is not None:
            navigator_webdriver = webdriver
        if user_agent_override is not None:
            navigator_user_agent = user_agent_override

        # A JSON round trip turns the tuple into a list; normalise it so the
        # value is not rejected by consumers that only accept tuples.
        if isinstance(navigator_languages_override, list):
            navigator_languages_override = tuple(navigator_languages_override)

        candidate_options = {
            'chrome_app': chrome_app,
            'chrome_csi': chrome_csi,
            'chrome_load_times': chrome_load_times,
            'chrome_runtime': chrome_runtime,
            'hairline': hairline,
            'iframe_content_window': iframe_content_window,
            'media_codecs': media_codecs,
            'navigator_hardware_concurrency': navigator_hardware_concurrency,
            'navigator_languages': navigator_languages,
            'navigator_permissions': navigator_permissions,
            'navigator_platform': navigator_platform,
            'navigator_plugins': navigator_plugins,
            'navigator_user_agent': navigator_user_agent,
            'navigator_vendor': navigator_vendor,
            'navigator_webdriver': navigator_webdriver,
            'sec_ch_ua': sec_ch_ua,
            'webgl_vendor': webgl_vendor,
            'navigator_languages_override': navigator_languages_override,
            'navigator_platform_override': navigator_platform_override,
            'navigator_user_agent_override': navigator_user_agent_override,
            'navigator_vendor_override': navigator_vendor_override,
            'sec_ch_ua_override': sec_ch_ua_override,
            'webgl_renderer_override': webgl_renderer_override,
            'webgl_vendor_override': webgl_vendor_override,
            'init_scripts_only': init_scripts_only,
            'script_logging': script_logging,
        }

        # Options left at None are omitted entirely rather than passed on.
        self.stealth_options = {
            name: value
            for name, value in candidate_options.items()
            if value is not None
        }

    @classmethod
    def from_dict(cls, config_dict: Union[dict, bool, None]) -> 'StealthConfig':
        """
        Create a StealthConfig from a plain configuration value.

        Args:
            config_dict: A mapping of constructor keyword arguments. For
                backward compatibility a bare boolean (the former
                ``stealth=True/False`` flag) is accepted and interpreted as
                the ``enabled`` switch, and None yields the defaults.

        Returns:
            A new StealthConfig instance.

        Raises:
            TypeError: If the mapping contains a key that is not a
                constructor parameter.
        """
        if config_dict is None:
            return cls()
        if isinstance(config_dict, bool):
            return cls(enabled=config_dict)
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Return ``enabled`` plus all stored options as a plain dict."""
        return {
            'enabled': self.enabled,
            **self.stealth_options
        }

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(enabled={self.enabled!r}, "
            f"stealth_options={self.stealth_options!r})"
        )
async def _apply_stealth(self, page: Page, stealth_config: Optional[StealthConfig] = None):
    """
    Best-effort application of stealth evasions to a Playwright page.

    Behaviour, in order:
      1. If no stealth library could be imported (``STEALTH_NEW_API is None``)
         a debug message is logged and nothing else happens.
      2. A missing ``stealth_config`` is replaced by a default ``StealthConfig()``.
      3. If the configuration is disabled a debug message is logged and
         nothing else happens.
      4. With the new API a ``Stealth`` object is built from the configured
         options (only bool / str / tuple values are forwarded) and applied
         to the page; with the legacy API ``stealth_async(page)`` is called
         and the configuration options are not used.
      5. Any exception raised in step 4 (including while logging success) is
         reported as a warning and swallowed, so the caller is never
         interrupted by a stealth failure.

    Args:
        page (Page): The Playwright page object.
        stealth_config (Optional[StealthConfig]): Stealth settings; defaults
            are used when omitted.
    """
    # Guard 1: nothing to do when neither stealth package is importable.
    if STEALTH_NEW_API is None:
        if self.logger and hasattr(self.logger, 'debug'):
            self.logger.debug(
                message="playwright-stealth not available, skipping stealth measures",
                tag="STEALTH"
            )
        return

    active_config = StealthConfig() if stealth_config is None else stealth_config

    # Guard 2: caller explicitly switched stealth off.
    if not active_config.enabled:
        if self.logger and hasattr(self.logger, 'debug'):
            self.logger.debug(
                message="Stealth measures disabled in configuration",
                tag="STEALTH"
            )
        return

    try:
        if STEALTH_NEW_API:
            # Forward only values of the types the original filter accepted.
            accepted = {
                option: setting
                for option, setting in active_config.stealth_options.items()
                if isinstance(setting, (bool, str, tuple))
            }
            applier = Stealth(**accepted)
            await applier.apply_stealth_async(page)
            backend = "tf-playwright-stealth"
            summary = f"with {len(accepted)} options"
        else:
            # Legacy function-based API: no per-option configuration.
            await stealth_async(page)
            backend = "v1.x"
            summary = "default (v1.x legacy)"

        # Success is only reported when a debug-capable logger exists.
        if self.logger and hasattr(self.logger, 'debug'):
            self.logger.debug(
                message="Applied stealth measures using {version} {config}",
                tag="STEALTH",
                params={"version": backend, "config": summary}
            )
    except Exception as exc:
        # Deliberately broad: stealth is optional and must not abort the crawl.
        if self.logger:
            self.logger.warning(
                message="Stealth measures failed, continuing without stealth: {error}",
                tag="STEALTH",
                params={"error": str(exc)}
            )
This functions works as below: @@ -533,10 +720,23 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get page for session page, context = await self.browser_manager.get_page(crawlerRunConfig=config) - # Apply stealth measures if enabled - if self.browser_config.stealth: - stealth = Stealth() - await stealth.apply_stealth_async(page) + # Apply stealth measures automatically (backward compatible) with optional config + # Check multiple possible locations for stealth config for flexibility + stealth_config = None + if hasattr(config, 'stealth_config') and config.stealth_config: + stealth_config = config.stealth_config + elif hasattr(config, 'stealth') and config.stealth: + # Alternative attribute name for backward compatibility + stealth_config = config.stealth if isinstance(config.stealth, StealthConfig) else StealthConfig.from_dict(config.stealth) + elif config.magic: + # Enable more aggressive stealth in magic mode + stealth_config = StealthConfig( + navigator_webdriver=False, # More aggressive stealth + webdriver=False, + chrome_app=False + ) + + await self._apply_stealth(page, stealth_config) # await page.goto(URL) @@ -939,7 +1139,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): tag="VIEWPORT", params={"error": str(e)}, ) - # Handle full page scanning if config.scan_full_page: # await self._handle_full_page_scan(page, config.scroll_delay) @@ -1843,8 +2042,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # }} # }})(); # """ - # ) - # """ NEW VERSION: # When {script} contains statements (e.g., const link = …; link.click();), # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'. diff --git a/test_stealth_compatibility.py b/test_stealth_compatibility.py index ad1ba02e..37b30171 100644 --- a/test_stealth_compatibility.py +++ b/test_stealth_compatibility.py @@ -1,75 +1,140 @@ #!/usr/bin/env python3 """ -Test suite for playwright-stealth v2.0.0+ compatibility fix. 
class TestPlaywrightStealthCompatibility:
    """Test playwright-stealth backward compatibility with transparent operation.

    The awaited collaborators are ``AsyncMock`` objects: awaiting a plain
    ``Mock`` raises ``TypeError``, which ``_apply_stealth`` swallows, so the
    success path would otherwise never be exercised.
    """

    def test_api_detection_works(self):
        """STEALTH_NEW_API must be one of the three documented states."""
        from crawl4ai.async_crawler_strategy import STEALTH_NEW_API

        # True -> Stealth class API, False -> legacy stealth_async, None -> no library
        assert STEALTH_NEW_API is None or isinstance(STEALTH_NEW_API, bool)

    @pytest.mark.asyncio
    @patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
    # create=True: the name does not exist when only the legacy library is installed
    @patch('crawl4ai.async_crawler_strategy.Stealth', create=True)
    async def test_apply_stealth_new_api(self, mock_stealth_class):
        """Test stealth application with new API works transparently"""
        from unittest.mock import AsyncMock
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        mock_stealth_instance = Mock()
        mock_stealth_instance.apply_stealth_async = AsyncMock()
        mock_stealth_class.return_value = mock_stealth_instance

        strategy = AsyncPlaywrightCrawlerStrategy()
        mock_page = Mock()

        await strategy._apply_stealth(mock_page)

        # The Stealth class was instantiated and its coroutine really awaited
        mock_stealth_class.assert_called_once()
        mock_stealth_instance.apply_stealth_async.assert_awaited_once_with(mock_page)

    @pytest.mark.asyncio
    @patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', False)
    async def test_apply_stealth_legacy_api(self):
        """Test stealth application with legacy API works transparently"""
        from unittest.mock import AsyncMock
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy()
        mock_page = Mock()

        # patch() restores (or removes) the attribute on exit, so a real
        # stealth_async imported by the module is never deleted.
        with patch(
            'crawl4ai.async_crawler_strategy.stealth_async',
            new_callable=AsyncMock,
            create=True,
        ) as mock_stealth_async:
            await strategy._apply_stealth(mock_page)

        mock_stealth_async.assert_awaited_once_with(mock_page)

    @pytest.mark.asyncio
    @patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', None)
    async def test_apply_stealth_no_library(self):
        """Test stealth application when no stealth library is available"""
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy()
        mock_page = Mock()

        # Must complete without error even when no stealth library is available
        await strategy._apply_stealth(mock_page)

    @pytest.mark.asyncio
    @patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
    @patch('crawl4ai.async_crawler_strategy.Stealth', create=True)
    async def test_stealth_error_handling(self, mock_stealth_class):
        """Test that stealth errors are handled gracefully without breaking crawling"""
        from unittest.mock import AsyncMock
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        mock_stealth_instance = Mock()
        mock_stealth_instance.apply_stealth_async = AsyncMock(
            side_effect=Exception("Stealth failed")
        )
        mock_stealth_class.return_value = mock_stealth_instance

        strategy = AsyncPlaywrightCrawlerStrategy()
        mock_page = Mock()

        # Must not raise: the failure is expected to be swallowed
        await strategy._apply_stealth(mock_page)

        # The failing call was actually reached
        mock_stealth_instance.apply_stealth_async.assert_called_once_with(mock_page)

    def test_strategy_creation_without_config(self):
        """Test that strategy can be created without any stealth configuration"""
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy()
        assert strategy is not None
        assert hasattr(strategy, '_apply_stealth')

    def test_browser_config_works_without_stealth_param(self):
        """Test that BrowserConfig works without stealth parameter"""
        from crawl4ai.async_configs import BrowserConfig

        config = BrowserConfig()
        assert config is not None

        # Should also work with other parameters
        config = BrowserConfig(headless=False, browser_type="firefox")
        assert config.headless is False
        assert config.browser_type == "firefox"