Compare commits


4 Commits

Author SHA1 Message Date
AHMET YILMAZ
65902a4773 feat: Enhance stealth compatibility with new and legacy APIs, add configuration support 2025-07-16 17:41:47 +08:00
AHMET YILMAZ
5c13baf574 feat: Add stealth option to BrowserConfig for enhanced browser behavior 2025-07-15 15:48:23 +08:00
AHMET YILMAZ
d2759824ef fix: Update playwright-stealth to v2.0.0+ compatibility
Fixes #1273

- Replace deprecated stealth_async import with Stealth class
- Add stealth flag to BrowserConfig (default: true)
- Update async_crawler_strategy to use Stealth().apply_stealth_async()
- Remove obsolete StealthConfig from browser_manager
- Maintain backward compatibility with existing stealth functionality

This fixes compatibility issues with playwright-stealth v2.0.0+, where the API changed from the stealth_async function to the Stealth class.
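A minimal sketch of that migration (illustrative only; the Stealth class and apply_stealth_async come from the diff below, while the Playwright setup and URL are placeholder assumptions):

```python
# Hedged sketch: playwright-stealth v1.x -> v2.0.0+ migration.
# Assumes playwright and tf-playwright-stealth (v2+) are installed.
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth  # v2.0.0+ replaces the stealth_async() function

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # v1.x (deprecated): await stealth_async(page)
        # v2.0.0+: instantiate Stealth and apply it to the page
        await Stealth().apply_stealth_async(page)
        await page.goto("https://example.com")  # placeholder URL
        print(await page.title())
        await browser.close()

asyncio.run(main())
```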

test: Add comprehensive tests for playwright-stealth v2.0.0+ compatibility

- Test Stealth class import and instantiation
- Test apply_stealth_async method availability
- Test BrowserConfig stealth flag functionality
- Test stealth flag serialization
- Verify backward compatibility with existing stealth functionality
2025-07-15 15:31:15 +08:00
UncleCode
bde1bba6a2 docs: Add missing documentation pages to mkdocs.yml
- Added Adaptive Crawling to Core section
- Added URL Seeding to Core section
- Added Adaptive Strategies to Advanced section
2025-07-12 19:56:33 +08:00
10 changed files with 668 additions and 278 deletions

View File

@@ -523,18 +523,15 @@ async def test_news_crawl():
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
```python
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to stop crawling
max_depth=5, # Maximum crawl depth
max_pages=20, # Maximum number of pages to crawl
strategy="statistical"
confidence_threshold=0.7,
max_history=100,
learning_rate=0.2
)
async with AsyncWebCrawler() as crawler:
adaptive_crawler = AdaptiveCrawler(crawler, config)
state = await adaptive_crawler.digest(
start_url="https://news.example.com",
query="latest news content"
)
result = await crawler.arun(
"https://news.example.com",
config=CrawlerRunConfig(adaptive_config=config)
)
# Crawler learns patterns and improves extraction over time
```

View File

@@ -12,6 +12,20 @@ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import hashlib
# Backward compatible stealth import
try:
# Try new tf-playwright-stealth API (Stealth class)
from playwright_stealth import Stealth
STEALTH_NEW_API = True
except ImportError:
try:
# Try old playwright-stealth API (stealth_async function)
from playwright_stealth import stealth_async
STEALTH_NEW_API = False
except ImportError:
# No stealth available
STEALTH_NEW_API = None
import uuid
from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
@@ -31,6 +45,107 @@ from types import MappingProxyType
import contextlib
from functools import partial
# Add StealthConfig class for backward compatibility and new features
class StealthConfig:
"""
Configuration class for stealth settings that works with tf-playwright-stealth.
This maintains backward compatibility while supporting all tf-playwright-stealth features.
"""
def __init__(
self,
# Common settings
enabled: bool = True,
# Core tf-playwright-stealth parameters (matching the actual library)
chrome_app: bool = True,
chrome_csi: bool = True,
chrome_load_times: bool = True,
chrome_runtime: bool = False, # Note: library default is False
hairline: bool = True,
iframe_content_window: bool = True,
media_codecs: bool = True,
navigator_hardware_concurrency: bool = True,
navigator_languages: bool = True,
navigator_permissions: bool = True,
navigator_platform: bool = True,
navigator_plugins: bool = True,
navigator_user_agent: bool = True,
navigator_vendor: bool = True,
navigator_webdriver: bool = True,
sec_ch_ua: bool = True,
webgl_vendor: bool = True,
# Override parameters
navigator_languages_override: tuple = ("en-US", "en"),
navigator_platform_override: str = "Win32",
navigator_user_agent_override: str = None,
navigator_vendor_override: str = None,
sec_ch_ua_override: str = None,
webgl_renderer_override: str = None,
webgl_vendor_override: str = None,
# Advanced parameters
init_scripts_only: bool = False,
script_logging: bool = False,
# Legacy parameters for backward compatibility
webdriver: bool = None, # This will be mapped to navigator_webdriver
user_agent_override: bool = None, # This will be mapped to navigator_user_agent
window_outerdimensions: bool = None, # This parameter doesn't exist in tf-playwright-stealth
):
self.enabled = enabled
# Handle legacy parameter mapping for backward compatibility
if webdriver is not None:
navigator_webdriver = webdriver
if user_agent_override is not None:
navigator_user_agent = user_agent_override
# Store all stealth options for the Stealth class - filter out None values
self.stealth_options = {
k: v for k, v in {
'chrome_app': chrome_app,
'chrome_csi': chrome_csi,
'chrome_load_times': chrome_load_times,
'chrome_runtime': chrome_runtime,
'hairline': hairline,
'iframe_content_window': iframe_content_window,
'media_codecs': media_codecs,
'navigator_hardware_concurrency': navigator_hardware_concurrency,
'navigator_languages': navigator_languages,
'navigator_permissions': navigator_permissions,
'navigator_platform': navigator_platform,
'navigator_plugins': navigator_plugins,
'navigator_user_agent': navigator_user_agent,
'navigator_vendor': navigator_vendor,
'navigator_webdriver': navigator_webdriver,
'sec_ch_ua': sec_ch_ua,
'webgl_vendor': webgl_vendor,
'navigator_languages_override': navigator_languages_override,
'navigator_platform_override': navigator_platform_override,
'navigator_user_agent_override': navigator_user_agent_override,
'navigator_vendor_override': navigator_vendor_override,
'sec_ch_ua_override': sec_ch_ua_override,
'webgl_renderer_override': webgl_renderer_override,
'webgl_vendor_override': webgl_vendor_override,
'init_scripts_only': init_scripts_only,
'script_logging': script_logging,
}.items() if v is not None
}
@classmethod
def from_dict(cls, config_dict: dict) -> 'StealthConfig':
"""Create StealthConfig from dictionary for easy configuration"""
return cls(**config_dict)
def to_dict(self) -> dict:
"""Convert to dictionary for serialization"""
return {
'enabled': self.enabled,
**self.stealth_options
}
class AsyncCrawlerStrategy(ABC):
"""
Abstract base class for crawler strategies.
@@ -39,7 +154,7 @@ class AsyncCrawlerStrategy(ABC):
@abstractmethod
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
pass # 4 + 3
pass # 4 + 3
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
@@ -220,6 +335,79 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
self.headers = headers
async def _apply_stealth(self, page: Page, stealth_config: Optional[StealthConfig] = None):
"""
Apply stealth measures to the page with backward compatibility and enhanced configuration.
This method automatically applies stealth measures and now supports configuration
through StealthConfig while maintaining backward compatibility.
Currently supports:
- tf-playwright-stealth (Stealth class with extensive configuration)
- Old playwright-stealth v1.x (stealth_async function) - legacy support
Args:
page (Page): The Playwright page object
stealth_config (Optional[StealthConfig]): Configuration for stealth settings
"""
if STEALTH_NEW_API is None:
# No stealth library available - silently continue
if self.logger and hasattr(self.logger, 'debug'):
self.logger.debug(
message="playwright-stealth not available, skipping stealth measures",
tag="STEALTH"
)
return
# Use default config if none provided
if stealth_config is None:
stealth_config = StealthConfig()
# Skip if stealth is disabled
if not stealth_config.enabled:
if self.logger and hasattr(self.logger, 'debug'):
self.logger.debug(
message="Stealth measures disabled in configuration",
tag="STEALTH"
)
return
try:
if STEALTH_NEW_API:
# Use tf-playwright-stealth API with configuration support
# Filter out any invalid parameters that might cause issues
valid_options = {}
for key, value in stealth_config.stealth_options.items():
# Accept boolean parameters and specific string/tuple parameters
if isinstance(value, (bool, str, tuple)):
valid_options[key] = value
stealth = Stealth(**valid_options)
await stealth.apply_stealth_async(page)
config_info = f"with {len(valid_options)} options"
else:
# Use old API (v1.x) - configuration options are limited
await stealth_async(page)
config_info = "default (v1.x legacy)"
# Only log if logger is available and in debug mode
if self.logger and hasattr(self.logger, 'debug'):
api_version = "tf-playwright-stealth" if STEALTH_NEW_API else "v1.x"
self.logger.debug(
message="Applied stealth measures using {version} {config}",
tag="STEALTH",
params={"version": api_version, "config": config_info}
)
except Exception as e:
# Silently continue if stealth fails - don't break the crawling process
if self.logger:
self.logger.warning(
message="Stealth measures failed, continuing without stealth: {error}",
tag="STEALTH",
params={"error": str(e)}
)
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
"""
Wait for a condition in a smart way. This function works as follows:
@@ -532,6 +720,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Get page for session
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
# Apply stealth measures automatically (backward compatible) with optional config
# Check multiple possible locations for stealth config for flexibility
stealth_config = None
if hasattr(config, 'stealth_config') and config.stealth_config:
stealth_config = config.stealth_config
elif hasattr(config, 'stealth') and config.stealth:
# Alternative attribute name for backward compatibility
stealth_config = config.stealth if isinstance(config.stealth, StealthConfig) else StealthConfig.from_dict(config.stealth)
elif config.magic:
# Enable more aggressive stealth in magic mode
stealth_config = StealthConfig(
navigator_webdriver=False, # More aggressive stealth
webdriver=False,
chrome_app=False
)
await self._apply_stealth(page, stealth_config)
# await page.goto(URL)
# Add default cookie
@@ -933,7 +1139,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
tag="VIEWPORT",
params={"error": str(e)},
)
# Handle full page scanning
if config.scan_full_page:
# await self._handle_full_page_scan(page, config.scroll_delay)
@@ -1837,8 +2042,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# }}
# }})();
# """
# )
# """ NEW VERSION:
# When {script} contains statements (e.g., const link = …; link.click();),
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.

View File

@@ -14,24 +14,8 @@ import hashlib
from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from playwright_stealth import StealthConfig
from .utils import get_chromium_path
stealth_config = StealthConfig(
webdriver=True,
chrome_app=True,
chrome_csi=True,
chrome_load_times=True,
chrome_runtime=True,
navigator_languages=True,
navigator_plugins=True,
navigator_permissions=True,
webgl_vendor=True,
outerdimensions=True,
navigator_hardware_concurrency=True,
media_codecs=True,
)
BROWSER_DISABLE_OPTIONS = [
"--disable-background-networking",
"--disable-background-timer-throttling",

View File

@@ -10,8 +10,9 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -29,34 +30,44 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores
```python
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
# Initialize with custom adaptive parameters
# Initialize with custom learning parameters
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to stop crawling
max_depth=5, # Maximum crawl depth
max_pages=20, # Maximum number of pages to crawl
top_k_links=3, # Number of top links to follow per page
strategy="statistical", # 'statistical' or 'embedding'
coverage_weight=0.4, # Weight for coverage in confidence calculation
consistency_weight=0.3, # Weight for consistency in confidence calculation
saturation_weight=0.3 # Weight for saturation in confidence calculation
confidence_threshold=0.7, # Min confidence to use learned patterns
max_history=100, # Remember last 100 crawls per domain
learning_rate=0.2, # How quickly to adapt to changes
patterns_per_page=3, # Patterns to learn per page type
extraction_strategy='css' # 'css' or 'xpath'
)
# Initialize adaptive crawler with web crawler
adaptive_crawler = AdaptiveCrawler(config)
# First crawl - crawler learns the structure
async with AsyncWebCrawler() as crawler:
adaptive_crawler = AdaptiveCrawler(crawler, config)
# Crawl and learn patterns
state = await adaptive_crawler.digest(
start_url="https://news.example.com/article/12345",
query="latest news articles and content"
result = await crawler.arun(
"https://news.example.com/article/12345",
config=CrawlerRunConfig(
adaptive_config=config,
extraction_hints={ # Optional hints to speed up learning
"title": "article h1",
"content": "article .body-content"
}
)
)
# Access results and confidence
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
print(f"Pages Crawled: {len(state.crawled_urls)}")
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
# Crawler identifies and stores patterns
if result.success:
state = adaptive_crawler.get_state("news.example.com")
print(f"Learned {len(state.patterns)} patterns")
print(f"Confidence: {state.avg_confidence:.2%}")
# Subsequent crawls - uses learned patterns
result2 = await crawler.arun(
"https://news.example.com/article/67890",
config=CrawlerRunConfig(adaptive_config=config)
)
# Automatically extracts using learned patterns!
```
**Expected Real-World Impact:**
@@ -81,7 +92,9 @@ twitter_config = VirtualScrollConfig(
container_selector="[data-testid='primaryColumn']",
scroll_count=20, # Number of scrolls
scroll_by="container_height", # Smart scrolling by container size
wait_after_scroll=1.0 # Let content load
wait_after_scroll=1.0, # Let content load
capture_method="incremental", # Capture new content on each scroll
deduplicate=True # Remove duplicate elements
)
# For e-commerce product grids (Instagram style)
@@ -89,7 +102,8 @@ grid_config = VirtualScrollConfig(
container_selector="main .product-grid",
scroll_count=30,
scroll_by=800, # Fixed pixel scrolling
wait_after_scroll=1.5 # Images need time
wait_after_scroll=1.5, # Images need time
stop_on_no_change=True # Smart stopping
)
# For news feeds with lazy loading
@@ -97,7 +111,9 @@ news_config = VirtualScrollConfig(
container_selector=".article-feed",
scroll_count=50,
scroll_by="page_height", # Viewport-based scrolling
wait_after_scroll=0.5 # Wait for content to load
wait_after_scroll=0.5,
wait_for_selector=".article-card", # Wait for specific elements
timeout=30000 # Max 30 seconds total
)
# Use it in your crawl
@@ -144,17 +160,29 @@ async with AsyncWebCrawler() as crawler:
### The Three-Layer Scoring System
```python
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LinkPreviewConfig
# Configure intelligent link analysis
link_config = LinkPreviewConfig(
# What to analyze
include_internal=True,
include_external=False,
max_links=10,
concurrency=5,
query="python tutorial", # For contextual scoring
score_threshold=0.3,
verbose=True
include_external=True,
max_links=100, # Analyze top 100 links
# Relevance scoring
query="machine learning tutorials", # Your interest
score_threshold=0.3, # Minimum relevance score
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
)
# Use in your crawl
@@ -162,51 +190,35 @@ result = await crawler.arun(
"https://tech-blog.example.com",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
score_links=True
)
)
# Access scored and sorted links
if result.success and result.links:
# Get scored links
internal_links = result.links.get("internal", [])
scored_links = [l for l in internal_links if l.get("total_score")]
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
# Create a scoring table
table = Table(title="Link Scoring Results", box=box.ROUNDED)
table.add_column("Link Text", style="cyan", width=40)
table.add_column("Intrinsic Score", justify="center")
table.add_column("Contextual Score", justify="center")
table.add_column("Total Score", justify="center", style="bold green")
for link in scored_links[:5]:
text = link.get('text', 'No text')[:40]
table.add_row(
text,
f"{link.get('intrinsic_score', 0):.1f}/10",
f"{link.get('contextual_score', 0):.2f}/1",
f"{link.get('total_score', 0):.3f}"
)
console.print(table)
for link in result.links["internal"][:10]: # Top 10 internal links
print(f"Score: {link['total_score']:.3f}")
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
print(f" URL: {link['href']}")
print(f" Title: {link['head_data']['title']}")
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
```
**Scoring Components:**
1. **Intrinsic Score**: Based on link quality indicators
1. **Intrinsic Score (0-10)**: Based on link quality indicators
- Position on page (navigation, content, footer)
- Link attributes (rel, title, class names)
- Anchor text quality and length
- URL structure and depth
2. **Contextual Score**: Relevance to your query using BM25 algorithm
2. **Contextual Score (0-1)**: Relevance to your query
- Semantic similarity using embeddings
- Keyword matching in link text and title
- Meta description analysis
- Content preview scoring
3. **Total Score**: Combined score for final ranking
3. **Total Score**: Weighted combination for final ranking
**Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -228,53 +240,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages
seeder_config = SeedingConfig(
# Discovery sources
source="cc+sitemap", # Sitemap + Common Crawl
source="sitemap+cc", # Sitemap + Common Crawl
# Filtering
pattern="*/product/*", # URL pattern matching
ignore_patterns=["*/reviews/*", "*/questions/*"],
# Validation
live_check=True, # Verify URLs are alive
max_urls=50, # Stop at 50 URLs
max_urls=5000, # Stop at 5000 URLs
# Performance
concurrency=100, # Maximum concurrent requests for live checks/head extraction
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
concurrency=100, # Parallel requests
hits_per_sec=10 # Rate limiting
)
async with AsyncUrlSeeder() as seeder:
console.print("Discovering URLs from Python docs...")
urls = await seeder.urls("docs.python.org", seeding_config)
console.print(f"\n✓ Discovered {len(urls)} URLs")
seeder = AsyncUrlSeeder(seeder_config)
urls = await seeder.discover("https://shop.example.com")
# Advanced: Relevance-based discovery
research_config = SeedingConfig(
source="sitemap+cc", # Sitemap + Common Crawl
source="crawl+sitemap", # Deep crawl + sitemap
pattern="*/blog/*", # Blog posts only
# Content relevance
extract_head=True, # Get meta tags
query="quantum computing tutorials",
scoring_method="bm25", # BM25 scoring method
scoring_method="bm25", # Or "semantic" (coming soon)
score_threshold=0.4, # High relevance only
# Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache
)
# Discover with progress tracking
discovered = []
async with AsyncUrlSeeder() as seeder:
discovered = await seeder.urls("https://physics-blog.com", research_config)
console.print(f"\n✓ Discovered {len(discovered)} URLs")
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
discovered.extend(batch)
print(f"Found {len(discovered)} relevant URLs so far...")
# Results include scores and metadata
for url_data in discovered[:5]:
print(f"URL: {url_data['url']}")
print(f"Score: {url_data['relevance_score']:.3f}")
print(f"Title: {url_data['head_data']['title']}")
print(f"Score: {url_data['score']:.3f}")
print(f"Title: {url_data['title']}")
```
**Discovery Methods:**
@@ -297,18 +309,35 @@ This release includes significant performance improvements through optimized res
### What We Optimized
```python
# Optimized crawling with v0.7.0 improvements
# Before v0.7.0 (slow)
results = []
for url in urls:
result = await crawler.arun(
url,
config=CrawlerRunConfig(
# Performance optimizations
wait_until="domcontentloaded", # Faster than networkidle
cache_mode=CacheMode.ENABLED # Enable caching
)
)
result = await crawler.arun(url)
results.append(result)
# After v0.7.0 (fast)
# Automatic batching and connection pooling
results = await crawler.arun_batch(
urls,
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
)
)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
```
**Performance Gains:**
@@ -318,6 +347,24 @@ for url in urls:
- **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes

View File

@@ -49,75 +49,46 @@ from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content():
url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "wait_for_session"
all_commits = []
async with AsyncWebCrawler() as crawler:
session_id = "github_commits_session"
url = "https://github.com/microsoft/TypeScript/commits/main"
all_commits = []
js_next_page = """
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
if (commits.length > 0) {
window.lastCommit = commits[0].textContent.trim();
}
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) {button.click(); console.log('button clicked') }
"""
# Define extraction schema
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
"fields": [{
"name": "title", "selector": "h4.markdown-title", "type": "text"
}],
}
extraction_strategy = JsonCssExtractionStrategy(schema)
wait_for = """() => {
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.lastCommit;
}"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li[data-testid='commit-row-item']",
"fields": [
{
"name": "title",
"selector": "h4 a",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
browser_config = BrowserConfig(
verbose=True,
headless=False,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
# JavaScript and wait configurations
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
# Crawl multiple pages
for page in range(3):
crawler_config = CrawlerRunConfig(
config = CrawlerRunConfig(
url=url,
session_id=session_id,
css_selector="li[data-testid='commit-row-item']",
extraction_strategy=extraction_strategy,
js_code=js_next_page if page > 0 else None,
wait_for=wait_for if page > 0 else None,
js_only=page > 0,
cache_mode=CacheMode.BYPASS,
capture_console_messages=True,
cache_mode=CacheMode.BYPASS
)
result = await crawler.arun(url=url, config=crawler_config)
if result.console_messages:
print(f"Page {page + 1} console messages:", result.console_messages)
if result.extracted_content:
# print(f"Page {page + 1} result:", result.extracted_content)
result = await crawler.arun(config=config)
if result.success:
commits = json.loads(result.extracted_content)
all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits")
else:
print(f"Page {page + 1}: No content extracted")
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Clean up session
await crawler.crawler_strategy.kill_session(session_id)
return all_commits
```
---

View File

@@ -91,12 +91,13 @@ async def crawl_twitter_timeline():
wait_after_scroll=1.0 # Twitter needs time to load
)
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
config = CrawlerRunConfig(
virtual_scroll_config=virtual_config
virtual_scroll_config=virtual_config,
# Optional: Set headless=False to watch it work
# browser_config=BrowserConfig(headless=False)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://twitter.com/search?q=AI",
config=config
@@ -199,7 +200,7 @@ Use **scan_full_page** when:
Virtual Scroll works seamlessly with extraction strategies:
```python
from crawl4ai import LLMExtractionStrategy, LLMConfig
from crawl4ai import LLMExtractionStrategy
# Define extraction schema
schema = {
@@ -221,7 +222,7 @@ config = CrawlerRunConfig(
scroll_count=20
),
extraction_strategy=LLMExtractionStrategy(
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
provider="openai/gpt-4o-mini",
schema=schema
)
)

View File

@@ -10,8 +10,9 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -29,34 +30,44 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores
```python
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
# Initialize with custom adaptive parameters
# Initialize with custom learning parameters
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to stop crawling
max_depth=5, # Maximum crawl depth
max_pages=20, # Maximum number of pages to crawl
top_k_links=3, # Number of top links to follow per page
strategy="statistical", # 'statistical' or 'embedding'
coverage_weight=0.4, # Weight for coverage in confidence calculation
consistency_weight=0.3, # Weight for consistency in confidence calculation
saturation_weight=0.3 # Weight for saturation in confidence calculation
confidence_threshold=0.7, # Min confidence to use learned patterns
max_history=100, # Remember last 100 crawls per domain
learning_rate=0.2, # How quickly to adapt to changes
patterns_per_page=3, # Patterns to learn per page type
extraction_strategy='css' # 'css' or 'xpath'
)
# Initialize adaptive crawler with web crawler
adaptive_crawler = AdaptiveCrawler(config)
# First crawl - crawler learns the structure
async with AsyncWebCrawler() as crawler:
adaptive_crawler = AdaptiveCrawler(crawler, config)
# Crawl and learn patterns
state = await adaptive_crawler.digest(
start_url="https://news.example.com/article/12345",
query="latest news articles and content"
result = await crawler.arun(
"https://news.example.com/article/12345",
config=CrawlerRunConfig(
adaptive_config=config,
extraction_hints={ # Optional hints to speed up learning
"title": "article h1",
"content": "article .body-content"
}
)
)
# Access results and confidence
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
print(f"Pages Crawled: {len(state.crawled_urls)}")
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
# Crawler identifies and stores patterns
if result.success:
state = adaptive_crawler.get_state("news.example.com")
print(f"Learned {len(state.patterns)} patterns")
print(f"Confidence: {state.avg_confidence:.2%}")
# Subsequent crawls - uses learned patterns
result2 = await crawler.arun(
"https://news.example.com/article/67890",
config=CrawlerRunConfig(adaptive_config=config)
)
# Automatically extracts using learned patterns!
```
**Expected Real-World Impact:**
@@ -81,7 +92,9 @@ twitter_config = VirtualScrollConfig(
container_selector="[data-testid='primaryColumn']",
scroll_count=20, # Number of scrolls
scroll_by="container_height", # Smart scrolling by container size
wait_after_scroll=1.0 # Let content load
wait_after_scroll=1.0, # Let content load
capture_method="incremental", # Capture new content on each scroll
deduplicate=True # Remove duplicate elements
)
# For e-commerce product grids (Instagram style)
@@ -89,7 +102,8 @@ grid_config = VirtualScrollConfig(
container_selector="main .product-grid",
scroll_count=30,
scroll_by=800, # Fixed pixel scrolling
wait_after_scroll=1.5 # Images need time
wait_after_scroll=1.5, # Images need time
stop_on_no_change=True # Smart stopping
)
# For news feeds with lazy loading
@@ -97,7 +111,9 @@ news_config = VirtualScrollConfig(
container_selector=".article-feed",
scroll_count=50,
scroll_by="page_height", # Viewport-based scrolling
wait_after_scroll=0.5 # Wait for content to load
wait_after_scroll=0.5,
wait_for_selector=".article-card", # Wait for specific elements
timeout=30000 # Max 30 seconds total
)
# Use it in your crawl
@@ -144,17 +160,29 @@ async with AsyncWebCrawler() as crawler:
### The Three-Layer Scoring System
```python
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LinkPreviewConfig
# Configure intelligent link analysis
link_config = LinkPreviewConfig(
# What to analyze
include_internal=True,
include_external=False,
max_links=10,
concurrency=5,
query="python tutorial", # For contextual scoring
score_threshold=0.3,
verbose=True
include_external=True,
max_links=100, # Analyze top 100 links
# Relevance scoring
query="machine learning tutorials", # Your interest
score_threshold=0.3, # Minimum relevance score
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
)
# Use in your crawl
@@ -162,51 +190,35 @@ result = await crawler.arun(
"https://tech-blog.example.com",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
score_links=True
)
)
# Access scored and sorted links
if result.success and result.links:
# Get scored links
internal_links = result.links.get("internal", [])
scored_links = [l for l in internal_links if l.get("total_score")]
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
# Create a scoring table
table = Table(title="Link Scoring Results", box=box.ROUNDED)
table.add_column("Link Text", style="cyan", width=40)
table.add_column("Intrinsic Score", justify="center")
table.add_column("Contextual Score", justify="center")
table.add_column("Total Score", justify="center", style="bold green")
for link in scored_links[:5]:
text = link.get('text', 'No text')[:40]
table.add_row(
text,
f"{link.get('intrinsic_score', 0):.1f}/10",
f"{link.get('contextual_score', 0):.2f}/1",
f"{link.get('total_score', 0):.3f}"
)
console.print(table)
for link in result.links["internal"][:10]: # Top 10 internal links
print(f"Score: {link['total_score']:.3f}")
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
print(f" URL: {link['href']}")
print(f" Title: {link['head_data']['title']}")
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
```
**Scoring Components:**
1. **Intrinsic Score**: Based on link quality indicators
1. **Intrinsic Score (0-10)**: Based on link quality indicators
- Position on page (navigation, content, footer)
- Link attributes (rel, title, class names)
- Anchor text quality and length
- URL structure and depth
2. **Contextual Score**: Relevance to your query using BM25 algorithm
2. **Contextual Score (0-1)**: Relevance to your query
- Semantic similarity using embeddings
- Keyword matching in link text and title
- Meta description analysis
- Content preview scoring
3. **Total Score**: Combined score for final ranking
3. **Total Score**: Weighted combination for final ranking
**Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -228,53 +240,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages
seeder_config = SeedingConfig(
# Discovery sources
source="cc+sitemap", # Sitemap + Common Crawl
source="sitemap+cc", # Sitemap + Common Crawl
# Filtering
pattern="*/product/*", # URL pattern matching
ignore_patterns=["*/reviews/*", "*/questions/*"],
# Validation
live_check=True, # Verify URLs are alive
max_urls=50, # Stop at 50 URLs
max_urls=5000, # Stop at 5000 URLs
# Performance
concurrency=100, # Maximum concurrent requests for live checks/head extraction
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
concurrency=100, # Parallel requests
hits_per_sec=10 # Rate limiting
)
async with AsyncUrlSeeder() as seeder:
console.print("Discovering URLs from Python docs...")
urls = await seeder.urls("docs.python.org", seeding_config)
console.print(f"\n✓ Discovered {len(urls)} URLs")
seeder = AsyncUrlSeeder(seeder_config)
urls = await seeder.discover("https://shop.example.com")
# Advanced: Relevance-based discovery
research_config = SeedingConfig(
source="sitemap+cc", # Sitemap + Common Crawl
source="crawl+sitemap", # Deep crawl + sitemap
pattern="*/blog/*", # Blog posts only
# Content relevance
extract_head=True, # Get meta tags
query="quantum computing tutorials",
scoring_method="bm25", # BM25 scoring method
scoring_method="bm25", # Or "semantic" (coming soon)
score_threshold=0.4, # High relevance only
# Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache
)
# Discover with progress tracking
discovered = []
async with AsyncUrlSeeder() as seeder:
discovered = await seeder.urls("https://physics-blog.com", research_config)
console.print(f"\n✓ Discovered {len(discovered)} URLs")
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
discovered.extend(batch)
print(f"Found {len(discovered)} relevant URLs so far...")
# Results include scores and metadata
for url_data in discovered[:5]:
print(f"URL: {url_data['url']}")
print(f"Score: {url_data['relevance_score']:.3f}")
print(f"Title: {url_data['head_data']['title']}")
print(f"Score: {url_data['score']:.3f}")
print(f"Title: {url_data['title']}")
```
**Discovery Methods:**
@@ -297,18 +309,35 @@ This release includes significant performance improvements through optimized res
### What We Optimized
```python
# Optimized crawling with v0.7.0 improvements
# Before v0.7.0 (slow)
results = []
for url in urls:
result = await crawler.arun(
url,
config=CrawlerRunConfig(
# Performance optimizations
wait_until="domcontentloaded", # Faster than networkidle
cache_mode=CacheMode.ENABLED # Enable caching
)
)
result = await crawler.arun(url)
results.append(result)
# After v0.7.0 (fast)
# Automatic batching and connection pooling
results = await crawler.arun_batch(
urls,
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
)
)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
```
**Performance Gains:**
@@ -318,6 +347,24 @@ for url in urls:
- **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes

View File

@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
async def main():
async with AsyncWebCrawler() as crawler:
# Create an adaptive crawler (config is optional)
# Create an adaptive crawler
adaptive = AdaptiveCrawler(crawler)
# Start crawling with a query
@@ -59,13 +59,13 @@ async def main():
from crawl4ai import AdaptiveConfig
config = AdaptiveConfig(
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
max_pages=30, # Maximum pages to crawl (default: 20)
top_k_links=5, # Links to follow per page (default: 3)
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
max_pages=20, # Maximum pages to crawl (default: 50)
top_k_links=3, # Links to follow per page (default: 5)
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
)
adaptive = AdaptiveCrawler(crawler, config)
adaptive = AdaptiveCrawler(crawler, config=config)
```
## Crawling Strategies
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
The confidence score (0-1) indicates how sufficient the gathered information is:
- **0.0-0.3**: Insufficient information, needs more crawling
- **0.3-0.6**: Partial information, may answer basic queries
- **0.6-0.7**: Good coverage, can answer most queries
- **0.7-1.0**: Excellent coverage, comprehensive information
- **0.6-0.8**: Good coverage, can answer most queries
- **0.8-1.0**: Excellent coverage, comprehensive information
### Statistics Display
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
- Avoid overly broad queries
### 2. Threshold Tuning
- Start with default (0.7) for general use
- Lower to 0.5-0.6 for exploratory crawling
- Raise to 0.8+ for exhaustive coverage
- Start with default (0.8) for general use
- Lower to 0.6-0.7 for exploratory crawling
- Raise to 0.9+ for exhaustive coverage
### 3. Performance Optimization
- Use appropriate `max_pages` limits

View File

@@ -137,7 +137,7 @@ async def smart_blog_crawler():
word_count_threshold=300 # Only substantial articles
)
# Extract URLs and crawl them
# Extract URLs and stream results as they come
tutorial_urls = [t["url"] for t in tutorials[:10]]
results = await crawler.arun_many(tutorial_urls, config=config)
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
```python
# Use both sources
config = SeedingConfig(source="sitemap+cc")
config = SeedingConfig(source="cc+sitemap")
urls = await seeder.urls("example.com", config)
```
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
| `extract_head` | bool | False | Extract metadata from page `<head>` |
| `live_check` | bool | False | Verify URLs are accessible |
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
| `concurrency` | int | 10 | Parallel workers for fetching |
| `hits_per_sec` | int | 5 | Rate limit for requests |
| `hits_per_sec` | int | None | Rate limit for requests |
| `force` | bool | False | Bypass cache, fetch fresh data |
| `verbose` | bool | False | Show detailed progress |
| `query` | str | None | Search query for BM25 scoring |
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
```python
# Find specific products
config = SeedingConfig(
source="sitemap+cc", # Use both sources
source="cc+sitemap", # Use both sources
extract_head=True,
query="wireless headphones noise canceling",
scoring_method="bm25",
@@ -782,7 +782,7 @@ class ResearchAssistant:
# Step 1: Discover relevant URLs
config = SeedingConfig(
source="sitemap+cc", # Maximum coverage
source="cc+sitemap", # Maximum coverage
extract_head=True, # Get metadata
query=topic, # Research topic
scoring_method="bm25", # Smart scoring
@@ -832,8 +832,7 @@ class ResearchAssistant:
# Extract URLs and crawl all articles
article_urls = [article['url'] for article in top_articles]
results = []
crawl_results = await crawler.arun_many(article_urls, config=config)
async for result in crawl_results:
async for result in await crawler.arun_many(article_urls, config=config):
if result.success:
results.append({
'url': result.url,
@@ -934,10 +933,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
# When crawling many URLs
async with AsyncWebCrawler() as crawler:
# Assuming urls is a list of URL strings
crawl_results = await crawler.arun_many(urls, config=config)
results = await crawler.arun_many(urls, config=config)
# Process as they arrive
async for result in crawl_results:
async for result in results:
process_immediately(result) # Don't wait for all
```
@@ -1021,7 +1020,7 @@ config = SeedingConfig(
# E-commerce product discovery
config = SeedingConfig(
source="sitemap+cc",
source="cc+sitemap",
pattern="*/product/*",
extract_head=True,
live_check=True

View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Test suite for playwright-stealth backward compatibility.
Tests that stealth functionality works automatically without user configuration.
"""
import pytest
import asyncio
from unittest.mock import AsyncMock, Mock, patch, MagicMock
class TestPlaywrightStealthCompatibility:
"""Test playwright-stealth backward compatibility with transparent operation"""
def test_api_detection_works(self):
"""Test that API detection works correctly"""
from crawl4ai.async_crawler_strategy import STEALTH_NEW_API
# The value depends on which version is installed, but should not be undefined
assert STEALTH_NEW_API in (True, False, None)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth')
async def test_apply_stealth_new_api(self, mock_stealth_class):
"""Test stealth application with new API works transparently"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Setup mock
mock_stealth_instance = Mock()
mock_stealth_instance.apply_stealth_async = AsyncMock()  # awaited by _apply_stealth
mock_stealth_class.return_value = mock_stealth_instance
# Create strategy instance
strategy = AsyncPlaywrightCrawlerStrategy()
# Mock page
mock_page = Mock()
# Test the method - should work transparently
await strategy._apply_stealth(mock_page)
# Verify new API was used
mock_stealth_class.assert_called_once()
mock_stealth_instance.apply_stealth_async.assert_called_once_with(mock_page)
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', False)
async def test_apply_stealth_legacy_api(self):
"""Test stealth application with legacy API works transparently"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Mock stealth_async function by setting it as a module attribute
mock_stealth_async = AsyncMock(return_value=None)  # awaited by _apply_stealth
# Import the module to add the mock function
import crawl4ai.async_crawler_strategy
crawl4ai.async_crawler_strategy.stealth_async = mock_stealth_async
try:
# Create strategy instance
strategy = AsyncPlaywrightCrawlerStrategy()
# Mock page
mock_page = Mock()
# Test the method - should work transparently
await strategy._apply_stealth(mock_page)
# Verify legacy API was used
mock_stealth_async.assert_called_once_with(mock_page)
finally:
# Clean up
if hasattr(crawl4ai.async_crawler_strategy, 'stealth_async'):
delattr(crawl4ai.async_crawler_strategy, 'stealth_async')
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', None)
async def test_apply_stealth_no_library(self):
"""Test stealth application when no stealth library is available"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Create strategy instance
strategy = AsyncPlaywrightCrawlerStrategy()
# Mock page
mock_page = Mock()
# Test the method - should work transparently even without stealth
await strategy._apply_stealth(mock_page)
# Should complete without error even when no stealth is available
@pytest.mark.asyncio
@patch('crawl4ai.async_crawler_strategy.STEALTH_NEW_API', True)
@patch('crawl4ai.async_crawler_strategy.Stealth')
async def test_stealth_error_handling(self, mock_stealth_class):
"""Test that stealth errors are handled gracefully without breaking crawling"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Setup mock to raise an error
mock_stealth_instance = Mock()
mock_stealth_instance.apply_stealth_async = Mock(side_effect=Exception("Stealth failed"))
mock_stealth_class.return_value = mock_stealth_instance
# Create strategy instance
strategy = AsyncPlaywrightCrawlerStrategy()
# Mock page
mock_page = Mock()
# Test the method - should not raise an error, continue silently
await strategy._apply_stealth(mock_page)
# Should complete without raising the stealth error
def test_strategy_creation_without_config(self):
"""Test that strategy can be created without any stealth configuration"""
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Should work without any stealth-related parameters
strategy = AsyncPlaywrightCrawlerStrategy()
assert strategy is not None
assert hasattr(strategy, '_apply_stealth')
def test_browser_config_works_without_stealth_param(self):
"""Test that BrowserConfig works without stealth parameter"""
from crawl4ai.async_configs import BrowserConfig
# Should work without stealth parameter
config = BrowserConfig()
assert config is not None
# Should also work with other parameters
config = BrowserConfig(headless=False, browser_type="firefox")
assert config.headless == False
assert config.browser_type == "firefox"
if __name__ == "__main__":
pytest.main([__file__, "-v"])