Implement initial MVP for Docker-based browser management in Crawl4ai, enabling
remote browser execution in containerized environments.
Key Changes:
- Add browser_farm module with Docker support components:
* BrowserFarmService: Manages browser endpoints
* DockerBrowser: Handles Docker browser communication
* Basic health check implementation
* Dockerfile with optimized Chrome/Playwright setup:
- Based on python:3.10-slim for minimal size
- Includes all required system dependencies
- Auto-installs crawl4ai and sets up Playwright
- Configures Chrome with remote debugging
- Uses socat for port forwarding (9223)
- Update core components:
* Rename use_managed_browser to use_remote_browser for clarity
* Modify BrowserManager to support Docker mode
* Add Docker configuration in BrowserConfig
* Update context handling for remote browsers
- Add example:
* hello_world_docker.py demonstrating Docker browser usage
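For reference, a minimal sketch of what hello_world_docker.py might look like, built only from the use_remote_browser flag introduced in this change (the shipped example may differ in its exact parameters):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CacheMode
    from crawl4ai.async_configs import BrowserConfig

    async def main():
        # use_remote_browser=True asks BrowserManager to attach to the
        # Dockerized browser over CDP instead of launching a local one
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            use_remote_browser=True,
        )
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun("https://example.com", cache_mode=CacheMode.BYPASS)
            print(result.markdown[:300] if result.success else "Crawl failed")

    if __name__ == "__main__":
        asyncio.run(main())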
Technical Details:
- Docker container exposes port 9223 (mapped to host:9333)
- Uses CDP (Chrome DevTools Protocol) for the remote connection (see the sketch after this list)
- Maintains compatibility with existing managed browser features
- Simplified endpoint management for MVP phase
- Optimized Docker setup:
* Minimal dependencies installation
* Proper Chrome flags for containerized environment
* Headless mode with GPU disabled
* Security considerations (no-sandbox mode)
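To make the connection path concrete, a raw Playwright client can attach to the host-mapped CDP endpoint like this. This is a sketch for manual verification only; the port value assumes the 9223 -> 9333 mapping above, and crawl4ai's internal wiring may differ:

    import asyncio
    from playwright.async_api import async_playwright

    async def check_cdp_connection():
        async with async_playwright() as p:
            # Attach to Chrome inside the container via the forwarded CDP port
            browser = await p.chromium.connect_over_cdp("http://localhost:9333")
            context = browser.contexts[0] if browser.contexts else await browser.new_context()
            page = await context.new_page()
            await page.goto("https://example.com")
            print(await page.title())
            await browser.close()

    asyncio.run(check_cdp_connection())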
Testing:
- Extensive Docker configuration testing and optimization
- Verified with hello_world_docker.py example
- Confirmed remote browser connection and crawling functionality
- Tested basic health checks
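The health-check mechanism itself is not spelled out in this commit; one plausible minimal version (an assumption, not the shipped code) polls CDP's /json/version endpoint on the forwarded port:

    import json
    import urllib.request

    def is_browser_healthy(host="localhost", port=9333, timeout=3.0):
        # CDP serves browser metadata at /json/version; a JSON response
        # containing a "Browser" field means Chrome is up and reachable
        try:
            url = f"http://{host}:{port}/json/version"
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return "Browser" in json.loads(resp.read())
        except (OSError, ValueError):
            return False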
This is the first step towards a scalable browser farm solution, setting up
the foundation for future enhancements like resource monitoring, multiple
browser instances, and container lifecycle management.
231 lines · 8.2 KiB · Python
import os, sys

# Make the local crawl4ai package importable when this file is run from the repo
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


# Category 1: Browser Configuration Tests
async def test_browser_config_object():
    """Test the new BrowserConfig object with various browser settings"""
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=False,
        viewport_width=1920,
        viewport_height=1080,
        use_remote_browser=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
    )

    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
        assert result.success, "Browser config crawl failed"
        assert len(result.html) > 0, "No HTML content retrieved"


async def test_browser_performance_config():
    """Test browser configurations focused on performance"""
    browser_config = BrowserConfig(
        text_mode=True,
        light_mode=True,
        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
        ignore_https_errors=True,
        java_script_enabled=False
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun('https://example.com')
        assert result.success, "Performance optimized crawl failed"
        assert result.status_code == 200, "Unexpected status code"


# Category 2: Content Processing Tests
async def test_content_extraction_config():
    """Test content extraction with various strategies"""
    crawler_config = CrawlerRunConfig(
        word_count_threshold=300,
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "article",
                "baseSelector": "div",
                "fields": [{
                    "name": "title",
                    "selector": "h1",
                    "type": "text"
                }]
            }
        ),
        chunking_strategy=RegexChunking(),
        content_filter=PruningContentFilter()
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com/article',
            config=crawler_config
        )
        assert result.extracted_content is not None, "Content extraction failed"
        assert 'title' in result.extracted_content, "Missing expected content field"


# Category 3: Cache and Session Management Tests
async def test_cache_and_session_management():
    """Test different cache modes and session handling"""
    browser_config = BrowserConfig(use_persistent_context=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.WRITE_ONLY,
        process_iframes=True,
        remove_overlay_elements=True
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First request - should write to cache
        result1 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )

        # Second request - should use a fresh fetch due to WRITE_ONLY mode
        result2 = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )

        assert result1.success and result2.success, "Cache mode crawl failed"
        assert result1.html == result2.html, "Inconsistent results between requests"


# Category 4: Media Handling Tests
async def test_media_handling_config():
    """Test configurations related to media handling"""
    # Ensure the downloads directory ~/.crawl4ai/downloads exists
    os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True)
    browser_config = BrowserConfig(
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
        downloads_path=os.path.expanduser("~/.crawl4ai/downloads")
    )
    crawler_config = CrawlerRunConfig(
        screenshot=True,
        pdf=True,
        adjust_viewport_to_content=True,
        wait_for_images=True,
        screenshot_height_threshold=20000
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.screenshot is not None, "Screenshot capture failed"
        assert result.pdf is not None, "PDF generation failed"


# Category 5: Anti-Bot and Site Interaction Tests
async def test_antibot_config():
    """Test configurations for handling anti-bot measures"""
    crawler_config = CrawlerRunConfig(
        simulate_user=True,
        override_navigator=True,
        magic=True,
        wait_for="js:()=>document.querySelector('body')",
        delay_before_return_html=1.0,
        log_console=True,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config
        )
        assert result.success, "Anti-bot measure handling failed"


# Category 6: Parallel Processing Tests
async def test_parallel_processing():
    """Test parallel processing capabilities"""
    crawler_config = CrawlerRunConfig(
        mean_delay=0.5,
        max_range=1.0,
        semaphore_count=5
    )

    urls = [
        'https://example.com/1',
        'https://example.com/2',
        'https://example.com/3'
    ]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls,
            config=crawler_config
        )
        assert len(results) == len(urls), "Not all URLs were processed"
        assert all(r.success for r in results), "Some parallel requests failed"


# Category 7: Backwards Compatibility Tests
async def test_legacy_parameter_support():
    """Test that legacy parameters still work"""
    async with AsyncWebCrawler(
        headless=True,
        browser_type="chromium",
        viewport_width=1024,
        viewport_height=768
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            screenshot=True,
            word_count_threshold=200,
            bypass_cache=True,
            css_selector=".main-content"
        )
        assert result.success, "Legacy parameter support failed"


# Category 8: Mixed Configuration Tests
async def test_mixed_config_usage():
    """Test mixing new config objects with legacy parameters"""
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(screenshot=True)

    async with AsyncWebCrawler(
        config=browser_config,
        verbose=True  # legacy parameter
    ) as crawler:
        result = await crawler.arun(
            'https://example.com',
            config=crawler_config,
            cache_mode=CacheMode.BYPASS,  # legacy parameter
            css_selector="body"  # legacy parameter
        )
        assert result.success, "Mixed configuration usage failed"


if __name__ == "__main__":
    async def run_tests():
        test_functions = [
            test_browser_config_object,
            # test_browser_performance_config,
            # test_content_extraction_config,
            # test_cache_and_session_management,
            # test_media_handling_config,
            # test_antibot_config,
            # test_parallel_processing,
            # test_legacy_parameter_support,
            # test_mixed_config_usage
        ]

        for test in test_functions:
            print(f"\nRunning {test.__name__}...")
            try:
                await test()
                print(f"✓ {test.__name__} passed")
            except AssertionError as e:
                print(f"✗ {test.__name__} failed: {str(e)}")
            except Exception as e:
                print(f"✗ {test.__name__} error: {str(e)}")

    asyncio.run(run_tests())