- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig
- Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern
- Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching
- Update install.py to install both playwright and patchright browsers automatically
- Add comprehensive documentation for anti-bot features (stealth mode + undetected browser)
- Create examples demonstrating stealth mode usage and comparison tests
- Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies
- Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium)
- Add dependency checker tool in tests/check_dependencies.py

Breaking changes: None - all existing functionality preserved

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
57 lines
1.7 KiB
Python
57 lines
1.7 KiB
Python
import asyncio
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
DefaultMarkdownGenerator,
|
|
PruningContentFilter,
|
|
CrawlResult,
|
|
UndetectedAdapter
|
|
)
|
|
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
|
|
|
|
|
async def main():
    """Crawl one page through the undetected browser adapter and print a summary.

    Builds a visible (non-headless), verbose browser configuration, plugs an
    ``UndetectedAdapter`` into an ``AsyncPlaywrightCrawlerStrategy``, and runs
    a single crawl with console-message capture enabled. The status code,
    success flag, number of captured console messages and the first 500
    characters of the raw markdown are printed to stdout.

    If the crawl fails or yields no markdown, the error message is printed
    instead of slicing a missing markdown object.
    """
    # Create browser config
    browser_config = BrowserConfig(
        headless=False,
        verbose=True,
    )

    # Create the undetected adapter
    undetected_adapter = UndetectedAdapter()

    # Create the crawler strategy with the undetected adapter
    crawler_strategy = AsyncPlaywrightCrawlerStrategy(
        browser_config=browser_config,
        browser_adapter=undetected_adapter,
    )

    # Create the crawler with our custom strategy
    async with AsyncWebCrawler(
        crawler_strategy=crawler_strategy,
        config=browser_config,
    ) as crawler:
        # Configure the crawl
        crawler_config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()
            ),
            capture_console_messages=True,  # Enable console capture to test adapter
        )

        # Test on a site that typically detects bots
        print("Testing undetected adapter...")
        result: CrawlResult = await crawler.arun(
            url="https://www.helloworld.org",
            config=crawler_config,
        )

        print(f"Status: {result.status_code}")
        print(f"Success: {result.success}")
        print(f"Console messages captured: {len(result.console_messages or [])}")

        # NOTE(review): a failed crawl presumably leaves ``markdown`` unset, so
        # guard before dereferencing ``raw_markdown`` — confirm against CrawlResult.
        markdown = result.markdown
        if result.success and markdown is not None:
            print(f"Markdown content (first 500 chars):\n{markdown.raw_markdown[:500]}")
        else:
            print(f"No markdown content available: {result.error_message}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the async example only when this file is executed as a script.
    asyncio.run(main())