feat: Enhance stealth compatibility with new and legacy APIs, add configuration support

This commit is contained in:
AHMET YILMAZ
2025-07-16 17:41:47 +08:00
parent 5c13baf574
commit 65902a4773
3 changed files with 322 additions and 64 deletions

View File

@@ -389,7 +389,6 @@ class BrowserConfig:
self, self,
browser_type: str = "chromium", browser_type: str = "chromium",
headless: bool = True, headless: bool = True,
stealth: bool = True,
browser_mode: str = "dedicated", browser_mode: str = "dedicated",
use_managed_browser: bool = False, use_managed_browser: bool = False,
cdp_url: str = None, cdp_url: str = None,
@@ -427,7 +426,6 @@ class BrowserConfig:
): ):
self.browser_type = browser_type self.browser_type = browser_type
self.headless = headless self.headless = headless
self.stealth = stealth
self.browser_mode = browser_mode self.browser_mode = browser_mode
self.use_managed_browser = use_managed_browser self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url self.cdp_url = cdp_url
@@ -502,7 +500,6 @@ class BrowserConfig:
return BrowserConfig( return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"), browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True), headless=kwargs.get("headless", True),
stealth=kwargs.get("stealth", True),
browser_mode=kwargs.get("browser_mode", "dedicated"), browser_mode=kwargs.get("browser_mode", "dedicated"),
use_managed_browser=kwargs.get("use_managed_browser", False), use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"), cdp_url=kwargs.get("cdp_url"),
@@ -539,7 +536,6 @@ class BrowserConfig:
result = { result = {
"browser_type": self.browser_type, "browser_type": self.browser_type,
"headless": self.headless, "headless": self.headless,
"stealth": self.stealth,
"browser_mode": self.browser_mode, "browser_mode": self.browser_mode,
"use_managed_browser": self.use_managed_browser, "use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url, "cdp_url": self.cdp_url,

View File

@@ -6,13 +6,26 @@ import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Union from typing import Callable, Dict, Any, List, Union
from typing import Optional, AsyncGenerator, Final from typing import Optional, AsyncGenerator, Final
from playwright_stealth import Stealth
import os import os
from playwright.async_api import Page, Error from playwright.async_api import Page, Error
from playwright.async_api import TimeoutError as PlaywrightTimeoutError from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO from io import BytesIO
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
import hashlib import hashlib
# Backward compatible stealth import.
#
# STEALTH_NEW_API records which playwright_stealth interface could be imported:
#   True  -> class-based API (``Stealth``)
#   False -> function-based legacy API (``stealth_async``)
#   None  -> no stealth library installed
#
# Both names are pre-bound to None so they always exist at module level; code
# and tests can then reference or patch them without NameError/AttributeError
# regardless of which (if any) library is installed.
# NOTE(review): the package names in the comments below come from the original
# change; confirm which distribution actually ships each interface.
Stealth = None
stealth_async = None
try:
    # Try new tf-playwright-stealth API (Stealth class)
    from playwright_stealth import Stealth
    STEALTH_NEW_API = True
except ImportError:
    try:
        # Try old playwright-stealth API (stealth_async function)
        from playwright_stealth import stealth_async
        STEALTH_NEW_API = False
    except ImportError:
        # No stealth available
        STEALTH_NEW_API = None
import uuid import uuid
from .js_snippet import load_js_script from .js_snippet import load_js_script
from .models import AsyncCrawlResponse from .models import AsyncCrawlResponse
@@ -32,6 +45,107 @@ from types import MappingProxyType
import contextlib import contextlib
from functools import partial from functools import partial
# Add StealthConfig class for backward compatibility and new features
class StealthConfig:
    """
    Configuration class for stealth settings that works with tf-playwright-stealth.

    The constructor collects its keyword options into ``stealth_options``, a
    plain dict intended to be passed to the stealth library's ``Stealth``
    class. Options whose value is ``None`` are left out of that dict so the
    library's own defaults apply. ``enabled`` is kept separately as a master
    switch and is never part of ``stealth_options``.

    Legacy keyword names are still accepted for backward compatibility:

    * ``webdriver`` overrides ``navigator_webdriver`` when it is not ``None``.
    * ``user_agent_override`` overrides ``navigator_user_agent`` when it is
      not ``None``.
    * ``window_outerdimensions`` is accepted and ignored.

    Attributes:
        enabled: Whether stealth should be applied at all.
        stealth_options: Mapping of option name to value (``None`` values
            removed).
    """

    def __init__(
        self,
        # Common settings
        enabled: bool = True,
        # Core tf-playwright-stealth parameters (matching the actual library)
        chrome_app: bool = True,
        chrome_csi: bool = True,
        chrome_load_times: bool = True,
        chrome_runtime: bool = False,  # Note: library default is False
        hairline: bool = True,
        iframe_content_window: bool = True,
        media_codecs: bool = True,
        navigator_hardware_concurrency: bool = True,
        navigator_languages: bool = True,
        navigator_permissions: bool = True,
        navigator_platform: bool = True,
        navigator_plugins: bool = True,
        navigator_user_agent: bool = True,
        navigator_vendor: bool = True,
        navigator_webdriver: bool = True,
        sec_ch_ua: bool = True,
        webgl_vendor: bool = True,
        # Override parameters
        navigator_languages_override: tuple = ("en-US", "en"),
        navigator_platform_override: str = "Win32",
        navigator_user_agent_override: Optional[str] = None,
        navigator_vendor_override: Optional[str] = None,
        sec_ch_ua_override: Optional[str] = None,
        webgl_renderer_override: Optional[str] = None,
        webgl_vendor_override: Optional[str] = None,
        # Advanced parameters
        init_scripts_only: bool = False,
        script_logging: bool = False,
        # Legacy parameters for backward compatibility
        webdriver: Optional[bool] = None,  # This will be mapped to navigator_webdriver
        user_agent_override: Optional[bool] = None,  # This will be mapped to navigator_user_agent
        window_outerdimensions: Optional[bool] = None,  # This parameter doesn't exist in tf-playwright-stealth
    ):
        self.enabled = enabled

        # Handle legacy parameter mapping for backward compatibility
        if webdriver is not None:
            navigator_webdriver = webdriver
        if user_agent_override is not None:
            navigator_user_agent = user_agent_override

        # Dict/JSON-sourced configs deliver sequences as lists. Normalise to a
        # tuple so the value has the same type as the default and survives
        # type-based filtering applied before the options reach the library.
        if isinstance(navigator_languages_override, list):
            navigator_languages_override = tuple(navigator_languages_override)

        candidate_options = {
            'chrome_app': chrome_app,
            'chrome_csi': chrome_csi,
            'chrome_load_times': chrome_load_times,
            'chrome_runtime': chrome_runtime,
            'hairline': hairline,
            'iframe_content_window': iframe_content_window,
            'media_codecs': media_codecs,
            'navigator_hardware_concurrency': navigator_hardware_concurrency,
            'navigator_languages': navigator_languages,
            'navigator_permissions': navigator_permissions,
            'navigator_platform': navigator_platform,
            'navigator_plugins': navigator_plugins,
            'navigator_user_agent': navigator_user_agent,
            'navigator_vendor': navigator_vendor,
            'navigator_webdriver': navigator_webdriver,
            'sec_ch_ua': sec_ch_ua,
            'webgl_vendor': webgl_vendor,
            'navigator_languages_override': navigator_languages_override,
            'navigator_platform_override': navigator_platform_override,
            'navigator_user_agent_override': navigator_user_agent_override,
            'navigator_vendor_override': navigator_vendor_override,
            'sec_ch_ua_override': sec_ch_ua_override,
            'webgl_renderer_override': webgl_renderer_override,
            'webgl_vendor_override': webgl_vendor_override,
            'init_scripts_only': init_scripts_only,
            'script_logging': script_logging,
        }
        # Store all stealth options for the Stealth class - filter out None values
        self.stealth_options = {
            key: value for key, value in candidate_options.items() if value is not None
        }

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(enabled={self.enabled!r}, "
            f"stealth_options={self.stealth_options!r})"
        )

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'StealthConfig':
        """Create StealthConfig from dictionary for easy configuration.

        The dictionary keys must be constructor parameter names; an unknown
        key raises ``TypeError`` exactly as calling the constructor would.
        """
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Convert to dictionary for serialization.

        Returns a new dict holding ``enabled`` plus every stored stealth
        option; the result can be fed back into :meth:`from_dict`.
        """
        return {
            'enabled': self.enabled,
            **self.stealth_options
        }
class AsyncCrawlerStrategy(ABC): class AsyncCrawlerStrategy(ABC):
""" """
Abstract base class for crawler strategies. Abstract base class for crawler strategies.
@@ -40,7 +154,7 @@ class AsyncCrawlerStrategy(ABC):
@abstractmethod @abstractmethod
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
pass # 4 + 3 pass # 4 + 3
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
""" """
@@ -221,6 +335,79 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
""" """
self.headers = headers self.headers = headers
async def _apply_stealth(self, page: Page, stealth_config: Optional[StealthConfig] = None):
    """
    Apply stealth measures to the page with backward compatibility and enhanced configuration.

    Behaviour depends on the module-level ``STEALTH_NEW_API`` flag:

    - ``None``: no stealth library is importable; the method logs at debug
      level and returns without touching the page.
    - truthy: a ``Stealth`` object is built from the configuration's
      ``stealth_options`` and its ``apply_stealth_async`` is awaited.
    - ``False``: the legacy ``stealth_async(page)`` function is awaited; the
      configuration's individual options are not used on this path.

    Any exception raised while applying stealth is caught and reported as a
    warning so that a stealth failure never aborts the crawl.

    Args:
        page (Page): The Playwright page object
        stealth_config (Optional[StealthConfig]): Configuration for stealth
            settings. A default ``StealthConfig()`` is used when omitted.

    Returns:
        None
    """

    def _debug(message, params=None):
        # Debug logging is optional: the logger may be absent or may not
        # expose a ``debug`` method.
        if self.logger and hasattr(self.logger, 'debug'):
            if params is None:
                self.logger.debug(message=message, tag="STEALTH")
            else:
                self.logger.debug(message=message, tag="STEALTH", params=params)

    if STEALTH_NEW_API is None:
        # No stealth library available - silently continue
        _debug("playwright-stealth not available, skipping stealth measures")
        return

    # Use default config if none provided
    if stealth_config is None:
        stealth_config = StealthConfig()

    # Skip if stealth is disabled
    if not stealth_config.enabled:
        _debug("Stealth measures disabled in configuration")
        return

    try:
        if STEALTH_NEW_API:
            # Forward only values of the types the options are declared with
            # (bool flags, string overrides, tuple overrides).
            valid_options = {}
            dropped_keys = []
            for key, value in stealth_config.stealth_options.items():
                # Configs loaded from dicts/JSON carry sequences as lists;
                # convert instead of discarding the user's override.
                if isinstance(value, list):
                    value = tuple(value)
                if isinstance(value, (bool, str, tuple)):
                    valid_options[key] = value
                else:
                    dropped_keys.append(key)

            if dropped_keys:
                # Make discarded options visible rather than silently lost.
                _debug(
                    "Ignoring unsupported stealth option values: {keys}",
                    {"keys": ", ".join(sorted(dropped_keys))},
                )

            stealth = Stealth(**valid_options)
            await stealth.apply_stealth_async(page)
            config_info = f"with {len(valid_options)} options"
        else:
            # Use old API (v1.x) - configuration options are limited
            await stealth_async(page)
            config_info = "default (v1.x legacy)"

        api_version = "tf-playwright-stealth" if STEALTH_NEW_API else "v1.x"
        _debug(
            "Applied stealth measures using {version} {config}",
            {"version": api_version, "config": config_info},
        )
    except Exception as e:
        # Deliberate best-effort: continue if stealth fails - don't break the
        # crawling process, but surface the reason as a warning.
        if self.logger:
            self.logger.warning(
                message="Stealth measures failed, continuing without stealth: {error}",
                tag="STEALTH",
                params={"error": str(e)}
            )
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
""" """
Wait for a condition in a smart way. This functions works as below: Wait for a condition in a smart way. This functions works as below:
@@ -533,10 +720,23 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Get page for session # Get page for session
page, context = await self.browser_manager.get_page(crawlerRunConfig=config) page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
# Apply stealth measures if enabled # Apply stealth measures automatically (backward compatible) with optional config
if self.browser_config.stealth: # Check multiple possible locations for stealth config for flexibility
stealth = Stealth() stealth_config = None
await stealth.apply_stealth_async(page) if hasattr(config, 'stealth_config') and config.stealth_config:
stealth_config = config.stealth_config
elif hasattr(config, 'stealth') and config.stealth:
# Alternative attribute name for backward compatibility
stealth_config = config.stealth if isinstance(config.stealth, StealthConfig) else StealthConfig.from_dict(config.stealth)
elif config.magic:
# Enable more aggressive stealth in magic mode
stealth_config = StealthConfig(
navigator_webdriver=False, # More aggressive stealth
webdriver=False,
chrome_app=False
)
await self._apply_stealth(page, stealth_config)
# await page.goto(URL) # await page.goto(URL)
@@ -939,7 +1139,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
tag="VIEWPORT", tag="VIEWPORT",
params={"error": str(e)}, params={"error": str(e)},
) )
# Handle full page scanning # Handle full page scanning
if config.scan_full_page: if config.scan_full_page:
# await self._handle_full_page_scan(page, config.scroll_delay) # await self._handle_full_page_scan(page, config.scroll_delay)
@@ -1843,8 +2042,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# }} # }}
# }})(); # }})();
# """ # """
# )
# """ NEW VERSION: # """ NEW VERSION:
# When {script} contains statements (e.g., const link = …; link.click();), # When {script} contains statements (e.g., const link = …; link.click();),
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'. # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.

View File

@@ -1,75 +1,140 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Test suite for playwright-stealth v2.0.0+ compatibility fix. Test suite for playwright-stealth backward compatibility.
Tests the stealth implementation update from deprecated stealth_async to Stealth class. Tests that stealth functionality works automatically without user configuration.
""" """
import pytest import pytest
from unittest.mock import Mock, patch import asyncio
from unittest.mock import Mock, patch, MagicMock
class TestPlaywrightStealthCompatibility: class TestPlaywrightStealthCompatibility:
"""Test playwright-stealth v2.0.0+ compatibility fix""" """Test playwright-stealth backward compatibility with transparent operation"""
def test_api_detection_works(self):
    """Test that API detection yields one of its three documented states.

    ``STEALTH_NEW_API`` must be exactly ``True`` (class-based API found),
    ``False`` (legacy function found) or ``None`` (no stealth library).
    """
    from crawl4ai.async_crawler_strategy import STEALTH_NEW_API

    # The value depends on which version is installed, but it must be a real
    # bool or None - not some other truthy/falsy object.
    assert STEALTH_NEW_API is None or isinstance(STEALTH_NEW_API, bool)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth') @patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_import_works(self, mock_stealth_class): async def test_apply_stealth_new_api(self, mock_stealth_class):
"""Test that Stealth class can be imported successfully""" """Test stealth application with new API works transparently"""
from crawl4ai.async_crawler_strategy import Stealth from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Should not raise ImportError # Setup mock
assert Stealth is not None
assert mock_stealth_class.called is False # Just checking import, not instantiation
@patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_instantiation_works(self, mock_stealth_class):
"""Test that Stealth class can be instantiated"""
from crawl4ai.async_crawler_strategy import Stealth
# Create a mock instance
mock_stealth_instance = Mock()
mock_stealth_class.return_value = mock_stealth_instance
# This should work without errors
stealth = Stealth()
assert stealth is not None
mock_stealth_class.assert_called_once()
@patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_has_apply_method(self, mock_stealth_class):
"""Test that Stealth instance has apply_stealth_async method"""
from crawl4ai.async_crawler_strategy import Stealth
# Create a mock instance with apply_stealth_async method
mock_stealth_instance = Mock() mock_stealth_instance = Mock()
mock_stealth_instance.apply_stealth_async = Mock() mock_stealth_instance.apply_stealth_async = Mock()
mock_stealth_class.return_value = mock_stealth_instance mock_stealth_class.return_value = mock_stealth_instance
stealth = Stealth() # Create strategy instance
assert hasattr(stealth, 'apply_stealth_async') strategy = AsyncPlaywrightCrawlerStrategy()
assert callable(stealth.apply_stealth_async)
def test_browser_config_has_stealth_flag(self): # Mock page
"""Test that BrowserConfig has stealth flag""" mock_page = Mock()
# Test the method - should work transparently
await strategy._apply_stealth(mock_page)
# Verify new API was used
mock_stealth_class.assert_called_once()
mock_stealth_instance.apply_stealth_async.assert_called_once_with(mock_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', False)
async def test_apply_stealth_legacy_api(self):
    """Test stealth application with legacy API works transparently"""
    # AsyncMock is imported locally; the module-level import list only pulls
    # in Mock, patch and MagicMock.
    from unittest.mock import AsyncMock

    import crawl4ai.async_crawler_strategy as strategy_module
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

    # The legacy entry point is awaited by the strategy, so the stand-in must
    # be awaitable; a plain Mock would raise TypeError inside the method,
    # which its error handling would swallow and hide from this test.
    mock_stealth_async = AsyncMock(return_value=None)

    # patch.object restores the previous state on exit (including a real
    # stealth_async if one was imported); create=True covers installs where
    # the attribute does not exist at all.
    with patch.object(strategy_module, 'stealth_async', mock_stealth_async, create=True):
        # Create strategy instance
        strategy = AsyncPlaywrightCrawlerStrategy()

        # Mock page
        mock_page = Mock()

        # Test the method - should work transparently
        await strategy._apply_stealth(mock_page)

    # Verify legacy API was used and actually awaited
    mock_stealth_async.assert_awaited_once_with(mock_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', None)
async def test_apply_stealth_no_library(self):
    """Test stealth application when no stealth library is available"""
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

    # Create strategy instance
    strategy = AsyncPlaywrightCrawlerStrategy()

    # Mock page
    mock_page = Mock()

    # Test the method - should work transparently even without stealth
    result = await strategy._apply_stealth(mock_page)

    # Should complete without error, return nothing, and leave the page
    # untouched when no stealth library is available.
    assert result is None
    assert mock_page.mock_calls == []
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth')
async def test_stealth_error_handling(self, mock_stealth_class):
    """Test that stealth errors are handled gracefully without breaking crawling"""
    # AsyncMock is imported locally; the module-level import list only pulls
    # in Mock, patch and MagicMock.
    from unittest.mock import AsyncMock

    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

    # Setup mock to raise an error. apply_stealth_async is awaited by the
    # strategy, so model it as a coroutine function that fails when awaited.
    mock_stealth_instance = Mock()
    mock_stealth_instance.apply_stealth_async = AsyncMock(
        side_effect=Exception("Stealth failed")
    )
    mock_stealth_class.return_value = mock_stealth_instance

    # Create strategy instance
    strategy = AsyncPlaywrightCrawlerStrategy()

    # Mock page
    mock_page = Mock()

    # Test the method - should not raise an error, continue silently
    await strategy._apply_stealth(mock_page)

    # The failing call must really have been attempted; otherwise this test
    # would also pass if stealth were skipped entirely.
    mock_stealth_instance.apply_stealth_async.assert_called_once_with(mock_page)
def test_strategy_creation_without_config(self):
    """A strategy built with no arguments exists and exposes ``_apply_stealth``."""
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

    # No stealth-related parameters are passed to the constructor.
    crawler = AsyncPlaywrightCrawlerStrategy()

    assert crawler is not None
    assert hasattr(crawler, '_apply_stealth')
def test_browser_config_works_without_stealth_param(self):
"""Test that BrowserConfig works without stealth parameter"""
from crawl4ai.async_configs import BrowserConfig from crawl4ai.async_configs import BrowserConfig
# Test default value # Should work without stealth parameter
config = BrowserConfig() config = BrowserConfig()
assert hasattr(config, 'stealth') assert config is not None
assert config.stealth is True # Default should be True
# Test explicit setting # Should also work with other parameters
config_disabled = BrowserConfig(stealth=False) config = BrowserConfig(headless=False, browser_type="firefox")
assert config_disabled.stealth is False assert config.headless == False
assert config.browser_type == "firefox"
def test_stealth_flag_serialization(self):
"""Test that stealth flag is properly serialized in BrowserConfig"""
from crawl4ai.async_configs import BrowserConfig
config = BrowserConfig(stealth=True)
config_dict = config.to_dict()
assert 'stealth' in config_dict
assert config_dict['stealth'] is True
if __name__ == "__main__": if __name__ == "__main__":