Implement initial MVP for Docker-based browser management in Crawl4ai, enabling
remote browser execution in containerized environments.
Key Changes:
- Add browser_farm module with Docker support components:
* BrowserFarmService: Manages browser endpoints
* DockerBrowser: Handles Docker browser communication
* Basic health check implementation
* Dockerfile with optimized Chrome/Playwright setup:
- Based on python:3.10-slim for minimal size
- Includes all required system dependencies
- Auto-installs crawl4ai and sets up Playwright
- Configures Chrome with remote debugging
- Uses socat for port forwarding (9223)
- Update core components:
* Rename use_managed_browser to use_remote_browser for clarity
* Modify BrowserManager to support Docker mode
* Add Docker configuration in BrowserConfig
* Update context handling for remote browsers
- Add example:
* hello_world_docker.py demonstrating Docker browser usage (a minimal sketch follows this list)
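A minimal sketch of the renamed flag in use, mirroring how the attached test script invokes AsyncWebCrawler. It assumes a browser container is already running and reachable (see Technical Details below); the kwargs shown reflect the example script rather than a final Docker-specific API:

# Hedged sketch: enable the remote (Docker) browser path via the renamed flag.
# Assumes a browser container is already up; kwargs mirror the test script below.
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def demo_remote():
    async with AsyncWebCrawler(
        headless=True,
        use_remote_browser=True,   # formerly use_managed_browser
    ) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            cache_mode=CacheMode.BYPASS,
        )
        print("success:", result.success)

asyncio.run(demo_remote())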
Technical Details:
- Docker container exposes port 9223 (mapped to host port 9333)
- Uses CDP (Chrome DevTools Protocol) for the remote connection (see the connectivity check after this list)
- Maintains compatibility with existing managed browser features
- Simplified endpoint management for the MVP phase
- Optimized Docker setup:
* Minimal dependencies installation
* Proper Chrome flags for containerized environment
* Headless mode with GPU disabled
* Security considerations (no-sandbox mode)
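The CDP endpoint can be sanity-checked directly with Playwright before wiring it into the crawler. This is a connectivity sketch only, not part of the crawl4ai API, and assumes the container's port 9223 is mapped to localhost:9333 as described above:

# Hedged connectivity check against the containerized browser's CDP endpoint.
# Port mapping (9223 in-container -> 9333 on the host) follows the notes above.
import asyncio
from playwright.async_api import async_playwright

async def check_cdp():
    async with async_playwright() as p:
        # connect_over_cdp attaches to an already-running Chrome instance
        browser = await p.chromium.connect_over_cdp("http://localhost:9333")
        print("connected:", browser.is_connected())
        await browser.close()

asyncio.run(check_cdp())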
Testing:
- Extensive Docker configuration testing and optimization
- Verified with hello_world_docker.py example
- Confirmed remote browser connection and crawling functionality
- Tested basic health checks
This is the first step towards a scalable browser farm solution, setting up
the foundation for future enhancements like resource monitoring, multiple
browser instances, and container lifecycle management.
import os, sys

# Make the repo checkout importable when running this script directly.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Assuming that the changes allow different configurations for the remote
# browser, persistent context, and so forth.


async def test_default_headless():
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
        use_remote_browser=False,
        use_persistent_context=False,
        ignore_https_errors=True,
        # Testing a normal ephemeral context
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_default_headless] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_remote_browser_persistent():
    # Treat use_persistent_context=True as the remote-browser scenario.
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
        use_remote_browser=True,
        use_persistent_context=True,  # should now behave the same as a managed browser
        user_data_dir="./output/test_profile",
        # This should store and reuse profile data across runs
    ) as crawler:
        result = await crawler.arun(
            url='https://www.google.com',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_remote_browser_persistent] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_session_reuse():
    # Create a session, then reuse it across multiple calls.
    session_id = "my_session"
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        # Fixed user agent for consistency across calls
        use_remote_browser=False,
        use_persistent_context=False,
    ) as crawler:

        # First call: creates the session
        result1 = await crawler.arun(
            url='https://www.example.com',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_session_reuse first call] success:", result1.success)

        # Second call: same session, so cookies may be retained
        result2 = await crawler.arun(
            url='https://www.example.com/about',
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_session_reuse second call] success:", result2.success)


async def test_magic_mode():
    # Test magic mode with override_navigator and simulate_user
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
        use_remote_browser=False,
        use_persistent_context=False,
        magic=True,
        override_navigator=True,
        simulate_user=True,
    ) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/business',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_magic_mode] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))


async def test_proxy_settings():
    # Test with a proxy (if available) to ensure crawling works through one
    async with AsyncWebCrawler(
        headless=True,
        verbose=False,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        proxy="http://127.0.0.1:8080",  # assumes a local proxy server for the test
        use_remote_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://httpbin.org/ip',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_proxy_settings] success:", result.success)
        if result.success:
            print("HTML preview:", result.html[:200] if result.html else "")


async def test_ignore_https_errors():
    # Test ignoring HTTPS errors against a domain with an invalid certificate.
    # self-signed.badssl.com serves a self-signed certificate, so it reliably
    # triggers an SSL error that ignore_https_errors=True should tolerate.
    async with AsyncWebCrawler(
        headless=True,
        verbose=True,
        user_agent="Mozilla/5.0",
        ignore_https_errors=True,
        use_remote_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://self-signed.badssl.com/',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
        )
        print("[test_ignore_https_errors] success:", result.success)


async def main():
    print("Running tests...")
    # await test_default_headless()
    # await test_remote_browser_persistent()
    # await test_session_reuse()
    # await test_magic_mode()
    # await test_proxy_settings()
    await test_ignore_https_errors()


if __name__ == "__main__":
    asyncio.run(main())