feat: Enhance stealth compatibility with new and legacy APIs, add configuration support

This commit is contained in:
AHMET YILMAZ
2025-07-16 17:41:47 +08:00
parent 5c13baf574
commit 65902a4773
3 changed files with 322 additions and 64 deletions

View File

@@ -389,7 +389,6 @@ class BrowserConfig:
self,
browser_type: str = "chromium",
headless: bool = True,
stealth: bool = True,
browser_mode: str = "dedicated",
use_managed_browser: bool = False,
cdp_url: str = None,
@@ -427,7 +426,6 @@ class BrowserConfig:
):
self.browser_type = browser_type
self.headless = headless
self.stealth = stealth
self.browser_mode = browser_mode
self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url
@@ -502,7 +500,6 @@ class BrowserConfig:
return BrowserConfig(
browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True),
stealth=kwargs.get("stealth", True),
browser_mode=kwargs.get("browser_mode", "dedicated"),
use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"),
@@ -539,7 +536,6 @@ class BrowserConfig:
result = {
"browser_type": self.browser_type,
"headless": self.headless,
"stealth": self.stealth,
"browser_mode": self.browser_mode,
"use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url,

View File

@@ -6,13 +6,26 @@ import time
from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Union
from typing import Optional, AsyncGenerator, Final
from playwright_stealth import Stealth
import os
from playwright.async_api import Page, Error
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import hashlib
# Backward compatible stealth import.
#
# STEALTH_NEW_API records which playwright_stealth flavour was importable:
#   True  -> the Stealth class is available (tf-playwright-stealth style API)
#   False -> only the legacy stealth_async function is available
#   None  -> no stealth package could be imported
#
# Both names are bound to None up front so that they always exist as module
# attributes, whichever branch succeeds. Code that looks them up by name
# (for example unittest.mock.patch targeting this module) then works no
# matter which package is installed.
Stealth = None
stealth_async = None
try:
    # Try new tf-playwright-stealth API (Stealth class)
    from playwright_stealth import Stealth
    STEALTH_NEW_API = True
except ImportError:
    try:
        # Try old playwright-stealth API (stealth_async function)
        from playwright_stealth import stealth_async
        STEALTH_NEW_API = False
    except ImportError:
        # No stealth available
        STEALTH_NEW_API = None
import uuid
from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
@@ -32,6 +45,107 @@ from types import MappingProxyType
import contextlib
from functools import partial
# Add StealthConfig class for backward compatibility and new features
class StealthConfig:
    """
    Configuration class for stealth settings that works with tf-playwright-stealth.

    The constructor collects every non-None option into ``self.stealth_options``,
    a plain dict intended to be forwarded as keyword arguments to the
    ``Stealth`` class. ``self.enabled`` is stored separately and is not part
    of ``stealth_options``.

    Legacy keyword names are still accepted for backward compatibility:

    * ``webdriver`` overrides ``navigator_webdriver`` when it is not None.
    * ``user_agent_override`` overrides ``navigator_user_agent`` when it is
      not None.
    * ``window_outerdimensions`` is accepted and ignored.
    """

    def __init__(
        self,
        # Common settings
        enabled: bool = True,
        # Core tf-playwright-stealth parameters (matching the actual library)
        chrome_app: bool = True,
        chrome_csi: bool = True,
        chrome_load_times: bool = True,
        chrome_runtime: bool = False,  # Note: library default is False
        hairline: bool = True,
        iframe_content_window: bool = True,
        media_codecs: bool = True,
        navigator_hardware_concurrency: bool = True,
        navigator_languages: bool = True,
        navigator_permissions: bool = True,
        navigator_platform: bool = True,
        navigator_plugins: bool = True,
        navigator_user_agent: bool = True,
        navigator_vendor: bool = True,
        navigator_webdriver: bool = True,
        sec_ch_ua: bool = True,
        webgl_vendor: bool = True,
        # Override parameters
        navigator_languages_override: Union[tuple, list] = ("en-US", "en"),
        navigator_platform_override: str = "Win32",
        navigator_user_agent_override: Optional[str] = None,
        navigator_vendor_override: Optional[str] = None,
        sec_ch_ua_override: Optional[str] = None,
        webgl_renderer_override: Optional[str] = None,
        webgl_vendor_override: Optional[str] = None,
        # Advanced parameters
        init_scripts_only: bool = False,
        script_logging: bool = False,
        # Legacy parameters for backward compatibility
        webdriver: Optional[bool] = None,  # Mapped to navigator_webdriver
        user_agent_override: Optional[bool] = None,  # Mapped to navigator_user_agent
        window_outerdimensions: Optional[bool] = None,  # Accepted but unused
    ):
        """
        Build the option set.

        Args:
            enabled: Master switch, stored as ``self.enabled``.
            navigator_languages_override: Language list; a ``list`` is
                converted to a ``tuple`` (see the note in the body).
            webdriver, user_agent_override: Legacy aliases; when not None
                they take precedence over ``navigator_webdriver`` and
                ``navigator_user_agent`` respectively.
            window_outerdimensions: Legacy option with no counterpart here;
                its value is discarded.

        All remaining parameters are stored unchanged in
        ``self.stealth_options`` under their own name, unless their value
        is None, in which case the key is omitted.
        """
        self.enabled = enabled

        # Handle legacy parameter mapping for backward compatibility
        if webdriver is not None:
            navigator_webdriver = webdriver
        if user_agent_override is not None:
            navigator_user_agent = user_agent_override

        # A dict produced by to_dict() and passed through JSON comes back
        # with the languages as a list. The option filter in _apply_stealth
        # only forwards bool/str/tuple values, so a list would be dropped
        # silently; normalise it to a tuple here.
        if isinstance(navigator_languages_override, list):
            navigator_languages_override = tuple(navigator_languages_override)

        candidates = {
            'chrome_app': chrome_app,
            'chrome_csi': chrome_csi,
            'chrome_load_times': chrome_load_times,
            'chrome_runtime': chrome_runtime,
            'hairline': hairline,
            'iframe_content_window': iframe_content_window,
            'media_codecs': media_codecs,
            'navigator_hardware_concurrency': navigator_hardware_concurrency,
            'navigator_languages': navigator_languages,
            'navigator_permissions': navigator_permissions,
            'navigator_platform': navigator_platform,
            'navigator_plugins': navigator_plugins,
            'navigator_user_agent': navigator_user_agent,
            'navigator_vendor': navigator_vendor,
            'navigator_webdriver': navigator_webdriver,
            'sec_ch_ua': sec_ch_ua,
            'webgl_vendor': webgl_vendor,
            'navigator_languages_override': navigator_languages_override,
            'navigator_platform_override': navigator_platform_override,
            'navigator_user_agent_override': navigator_user_agent_override,
            'navigator_vendor_override': navigator_vendor_override,
            'sec_ch_ua_override': sec_ch_ua_override,
            'webgl_renderer_override': webgl_renderer_override,
            'webgl_vendor_override': webgl_vendor_override,
            'init_scripts_only': init_scripts_only,
            'script_logging': script_logging,
        }
        # Store all stealth options for the Stealth class - filter out None values
        self.stealth_options = {
            key: value for key, value in candidates.items() if value is not None
        }

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'StealthConfig':
        """
        Create a StealthConfig from a dictionary of constructor keywords.

        The dictionary is expanded directly into ``__init__``, so a key that
        is not a constructor parameter raises ``TypeError``.
        """
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """
        Convert to a dictionary for serialization.

        Returns ``enabled`` plus every entry of ``stealth_options``. The
        result can be passed back to ``from_dict``.
        """
        return {
            'enabled': self.enabled,
            **self.stealth_options
        }
class AsyncCrawlerStrategy(ABC):
"""
Abstract base class for crawler strategies.
@@ -40,7 +154,7 @@ class AsyncCrawlerStrategy(ABC):
@abstractmethod
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
pass # 4 + 3
pass # 4 + 3
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
@@ -221,6 +335,79 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
self.headers = headers
async def _apply_stealth(self, page: Page, stealth_config: Optional[StealthConfig] = None):
    """
    Best-effort application of stealth evasions to a Playwright page.

    Which code path runs is decided by the module-level STEALTH_NEW_API flag:

    - None:  nothing is applied; a debug message is logged if possible.
    - True:  a ``Stealth`` object is built from the config options and its
             ``apply_stealth_async`` is awaited with the page.
    - False: the legacy ``stealth_async(page)`` function is awaited; the
             config options are not passed to it.

    Any exception raised while applying stealth is caught and reported
    through ``self.logger.warning`` (when a logger is set); it is never
    propagated to the caller.

    Args:
        page (Page): The Playwright page object
        stealth_config (Optional[StealthConfig]): Stealth settings; a default
            ``StealthConfig()`` is used when None. Nothing is applied when
            its ``enabled`` attribute is falsy.
    """

    def log_debug(message, **extra):
        # Debug output is optional: only emitted when a logger is set and
        # it exposes a debug() method. Checked at call time, not up front.
        if self.logger and hasattr(self.logger, 'debug'):
            self.logger.debug(message=message, tag="STEALTH", **extra)

    # Guard: neither stealth flavour could be imported.
    if STEALTH_NEW_API is None:
        log_debug("playwright-stealth not available, skipping stealth measures")
        return

    settings = StealthConfig() if stealth_config is None else stealth_config

    # Guard: stealth explicitly switched off in the configuration.
    if not settings.enabled:
        log_debug("Stealth measures disabled in configuration")
        return

    try:
        if not STEALTH_NEW_API:
            # Legacy function-based API: no per-option configuration.
            await stealth_async(page)
            summary = "default (v1.x legacy)"
            library = "v1.x"
        else:
            # Class-based API: forward only bool, str and tuple option values.
            accepted = {
                name: val
                for name, val in settings.stealth_options.items()
                if isinstance(val, (bool, str, tuple))
            }
            await Stealth(**accepted).apply_stealth_async(page)
            summary = f"with {len(accepted)} options"
            library = "tf-playwright-stealth"

        log_debug(
            "Applied stealth measures using {version} {config}",
            params={"version": library, "config": summary},
        )
    except Exception as exc:
        # Stealth is optional: a failure here must not abort the crawl.
        if self.logger:
            self.logger.warning(
                message="Stealth measures failed, continuing without stealth: {error}",
                tag="STEALTH",
                params={"error": str(exc)}
            )
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
"""
Wait for a condition in a smart way. This functions works as below:
@@ -533,10 +720,23 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Get page for session
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
# Apply stealth measures if enabled
if self.browser_config.stealth:
stealth = Stealth()
await stealth.apply_stealth_async(page)
# Apply stealth measures automatically (backward compatible) with optional config
# Check multiple possible locations for stealth config for flexibility
stealth_config = None
if hasattr(config, 'stealth_config') and config.stealth_config:
stealth_config = config.stealth_config
elif hasattr(config, 'stealth') and config.stealth:
# Alternative attribute name for backward compatibility
stealth_config = config.stealth if isinstance(config.stealth, StealthConfig) else StealthConfig.from_dict(config.stealth)
elif config.magic:
# Enable more aggressive stealth in magic mode
stealth_config = StealthConfig(
navigator_webdriver=False, # More aggressive stealth
webdriver=False,
chrome_app=False
)
await self._apply_stealth(page, stealth_config)
# await page.goto(URL)
@@ -939,7 +1139,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
tag="VIEWPORT",
params={"error": str(e)},
)
# Handle full page scanning
if config.scan_full_page:
# await self._handle_full_page_scan(page, config.scroll_delay)
@@ -1843,8 +2042,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# }}
# }})();
# """
# )
# """ NEW VERSION:
# When {script} contains statements (e.g., const link = …; link.click();),
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.

View File

@@ -1,75 +1,140 @@
#!/usr/bin/env python3
"""
Test suite for playwright-stealth v2.0.0+ compatibility fix.
Tests the stealth implementation update from deprecated stealth_async to Stealth class.
Test suite for playwright-stealth backward compatibility.
Tests that stealth functionality works automatically without user configuration.
"""
import pytest
from unittest.mock import Mock, patch
import asyncio
from unittest.mock import Mock, patch, MagicMock
class TestPlaywrightStealthCompatibility:
"""Test playwright-stealth v2.0.0+ compatibility fix"""
"""Test playwright-stealth backward compatibility with transparent operation"""
def test_api_detection_works(self):
    """Test that API detection yields one of its three documented states.

    STEALTH_NEW_API is set by the guarded import in async_crawler_strategy
    to True (Stealth class available), False (legacy stealth_async only)
    or None (no stealth package).
    """
    from crawl4ai.async_crawler_strategy import STEALTH_NEW_API

    # Identity/type check rather than `in (True, False, None)`, because
    # `1 == True` would let a stray integer slip through a membership test.
    assert STEALTH_NEW_API is None or isinstance(STEALTH_NEW_API, bool)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_import_works(self, mock_stealth_class):
"""Test that Stealth class can be imported successfully"""
from crawl4ai.async_crawler_strategy import Stealth
async def test_apply_stealth_new_api(self, mock_stealth_class):
"""Test stealth application with new API works transparently"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Should not raise ImportError
assert Stealth is not None
assert mock_stealth_class.called is False # Just checking import, not instantiation
@patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_instantiation_works(self, mock_stealth_class):
"""Test that Stealth class can be instantiated"""
from crawl4ai.async_crawler_strategy import Stealth
# Create a mock instance
mock_stealth_instance = Mock()
mock_stealth_class.return_value = mock_stealth_instance
# This should work without errors
stealth = Stealth()
assert stealth is not None
mock_stealth_class.assert_called_once()
@patch('crawl4ai.async_crawler_strategy.Stealth')
def test_stealth_has_apply_method(self, mock_stealth_class):
"""Test that Stealth instance has apply_stealth_async method"""
from crawl4ai.async_crawler_strategy import Stealth
# Create a mock instance with apply_stealth_async method
# Setup mock
mock_stealth_instance = Mock()
mock_stealth_instance.apply_stealth_async = Mock()
mock_stealth_class.return_value = mock_stealth_instance
stealth = Stealth()
assert hasattr(stealth, 'apply_stealth_async')
assert callable(stealth.apply_stealth_async)
# Create strategy instance
strategy = AsyncPlaywrightCrawlerStrategy()
def test_browser_config_has_stealth_flag(self):
"""Test that BrowserConfig has stealth flag"""
# Mock page
mock_page = Mock()
# Test the method - should work transparently
await strategy._apply_stealth(mock_page)
# Verify new API was used
mock_stealth_class.assert_called_once()
mock_stealth_instance.apply_stealth_async.assert_called_once_with(mock_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', False)
async def test_apply_stealth_legacy_api(self):
    """Test stealth application with legacy API works transparently.

    With STEALTH_NEW_API forced to False, _apply_stealth must await the
    module-level stealth_async function with the page.
    """
    # Imported locally so this test does not depend on the file-level
    # unittest.mock import list.
    from unittest.mock import AsyncMock
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

    # stealth_async is awaited by _apply_stealth, so the stand-in must be
    # awaitable. A plain Mock returning None would make `await` raise a
    # TypeError that _apply_stealth swallows, hiding the failure.
    mock_stealth_async = AsyncMock(return_value=None)

    # create=True lets the patch work when the legacy package is absent and
    # the attribute does not exist; on exit, patch restores the previous
    # state (original function, or no attribute at all).
    with patch(
        'crawl4ai.async_crawler_strategy.stealth_async',
        mock_stealth_async,
        create=True,
    ):
        # Create strategy instance
        strategy = AsyncPlaywrightCrawlerStrategy()

        # Mock page
        mock_page = Mock()

        # Test the method - should work transparently
        await strategy._apply_stealth(mock_page)

    # Verify legacy API was used and actually awaited
    mock_stealth_async.assert_awaited_once_with(mock_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', None)
async def test_apply_stealth_no_library(self):
    """_apply_stealth must return normally when no stealth package was importable.

    STEALTH_NEW_API is patched to None for the duration of the test; the
    test passes as long as the awaited call does not raise.
    """
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy as StrategyCls

    crawler = StrategyCls()
    fake_page = Mock()

    # No assertion needed: an exception here fails the test.
    await crawler._apply_stealth(fake_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth')
async def test_stealth_error_handling(self, mock_stealth_class):
    """A failure inside the stealth library must not escape _apply_stealth.

    The patched Stealth class hands back an object whose
    apply_stealth_async raises; the test passes as long as the awaited
    call returns without propagating that exception.
    """
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy as StrategyCls

    # Stealth(...) -> object whose apply_stealth_async blows up when called.
    exploding_apply = Mock(side_effect=Exception("Stealth failed"))
    mock_stealth_class.return_value = Mock(apply_stealth_async=exploding_apply)

    crawler = StrategyCls()
    fake_page = Mock()

    # An exception here would fail the test.
    await crawler._apply_stealth(fake_page)
def test_strategy_creation_without_config(self):
    """The strategy is constructible with no arguments and exposes _apply_stealth."""
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy as StrategyCls

    # No stealth-related parameters are passed on purpose.
    crawler = StrategyCls()

    assert crawler is not None
    assert hasattr(crawler, '_apply_stealth')
def test_browser_config_works_without_stealth_param(self):
"""Test that BrowserConfig works without stealth parameter"""
from crawl4ai.async_configs import BrowserConfig
# Test default value
# Should work without stealth parameter
config = BrowserConfig()
assert hasattr(config, 'stealth')
assert config.stealth is True # Default should be True
assert config is not None
# Test explicit setting
config_disabled = BrowserConfig(stealth=False)
assert config_disabled.stealth is False
def test_stealth_flag_serialization(self):
"""Test that stealth flag is properly serialized in BrowserConfig"""
from crawl4ai.async_configs import BrowserConfig
config = BrowserConfig(stealth=True)
config_dict = config.to_dict()
assert 'stealth' in config_dict
assert config_dict['stealth'] is True
# Should also work with other parameters
config = BrowserConfig(headless=False, browser_type="firefox")
assert config.headless == False
assert config.browser_type == "firefox"
if __name__ == "__main__":