Merge branch 'main' into fix-raw-url-parsing

This commit is contained in:
João Martins
2025-05-30 13:03:25 +01:00
255 changed files with 67002 additions and 6739 deletions

View File

@@ -7,6 +7,7 @@ import json
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from crawl4ai import LLMConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business"
extraction_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract only content related to technology",
)
result = await crawler.arun(
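This hunk migrates LLMExtractionStrategy from separate provider/api_token arguments to a single llm_config parameter built from the newly imported LLMConfig. A minimal sketch of the updated call pattern, assuming the same imports shown in the hunks above:

import os
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

extraction_strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
    ),
    instruction="Extract only content related to technology",
)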

View File

@@ -0,0 +1,4 @@
"""Docker browser strategy tests.
This package contains tests for the Docker browser strategy implementation.
"""

View File

@@ -0,0 +1,651 @@
"""Test examples for Docker Browser Strategy.
These examples demonstrate the functionality of Docker Browser Strategy
and serve as functional tests.
"""
import asyncio
import os
import sys
import shutil
import uuid
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.browser import DockerConfig
from crawl4ai.browser import DockerRegistry
from crawl4ai.browser import DockerUtils
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
# Global Docker utils instance
docker_utils = DockerUtils(logger)
async def test_docker_components():
"""Test Docker utilities, registry, and image building.
This function tests the core Docker components before running the browser tests.
It validates DockerRegistry, DockerUtils, and builds test images to ensure
everything is functioning correctly.
"""
logger.info("Testing Docker components", tag="SETUP")
# Create a test registry directory
registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
registry_file = os.path.join(registry_dir, "test_registry.json")
os.makedirs(registry_dir, exist_ok=True)
try:
# 1. Test DockerRegistry
logger.info("Testing DockerRegistry...", tag="SETUP")
registry = DockerRegistry(registry_file)
# Test saving and loading registry
test_container_id = "test-container-123"
registry.register_container(test_container_id, 9876, "test-hash-123")
registry.save()
# Create a new registry instance that loads from the file
registry2 = DockerRegistry(registry_file)
port = registry2.get_container_host_port(test_container_id)
hash_value = registry2.get_container_config_hash(test_container_id)
if port != 9876 or hash_value != "test-hash-123":
logger.error("DockerRegistry persistence failed", tag="SETUP")
return False
# Clean up test container from registry
registry2.unregister_container(test_container_id)
logger.success("DockerRegistry works correctly", tag="SETUP")
# 2. Test DockerUtils
logger.info("Testing DockerUtils...", tag="SETUP")
# Test port detection
in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use
logger.info(f"Port 22 in use: {in_use}", tag="SETUP")
# Get next available port
available_port = docker_utils.get_next_available_port(9000)
logger.info(f"Next available port: {available_port}", tag="SETUP")
# Test config hash generation
config_dict = {"mode": "connect", "headless": True}
config_hash = docker_utils.generate_config_hash(config_dict)
logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")
# 3. Test Docker is available
logger.info("Checking Docker availability...", tag="SETUP")
if not await check_docker_available():
logger.error("Docker is not available - cannot continue tests", tag="SETUP")
return False
# 4. Test building connect image
logger.info("Building connect mode Docker image...", tag="SETUP")
connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
if not connect_image:
logger.error("Failed to build connect mode image", tag="SETUP")
return False
logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")
# 5. Test building launch image
logger.info("Building launch mode Docker image...", tag="SETUP")
launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
if not launch_image:
logger.error("Failed to build launch mode image", tag="SETUP")
return False
logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")
# 6. Test creating and removing container
logger.info("Testing container creation and removal...", tag="SETUP")
container_id = await docker_utils.create_container(
image_name=launch_image,
host_port=available_port,
container_name="crawl4ai-test-container"
)
if not container_id:
logger.error("Failed to create test container", tag="SETUP")
return False
logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")
# Verify container is running
running = await docker_utils.is_container_running(container_id)
if not running:
logger.error("Test container is not running", tag="SETUP")
await docker_utils.remove_container(container_id)
return False
# Test commands in container
logger.info("Testing command execution in container...", tag="SETUP")
returncode, stdout, stderr = await docker_utils.exec_in_container(
container_id, ["ls", "-la", "/"]
)
if returncode != 0:
logger.error(f"Command execution failed: {stderr}", tag="SETUP")
await docker_utils.remove_container(container_id)
return False
# Verify Chromium is installed in the container
returncode, stdout, stderr = await docker_utils.exec_in_container(
container_id, ["which", "chromium"]
)
if returncode != 0:
logger.error("Chromium not found in container", tag="SETUP")
await docker_utils.remove_container(container_id)
return False
chromium_path = stdout.strip()
logger.info(f"Chromium found at: {chromium_path}", tag="SETUP")
# Check the Chromium version
returncode, stdout, stderr = await docker_utils.exec_in_container(
container_id, ["chromium", "--version"]
)
if returncode != 0:
logger.error(f"Failed to get Chromium version: {stderr}", tag="SETUP")
await docker_utils.remove_container(container_id)
return False
logger.info(f"Chromium version: {stdout.strip()}", tag="SETUP")
# Remove test container
removed = await docker_utils.remove_container(container_id)
if not removed:
logger.error("Failed to remove test container", tag="SETUP")
return False
logger.success("Test container removed successfully", tag="SETUP")
# All components tested successfully
logger.success("All Docker components tested successfully", tag="SETUP")
return True
except Exception as e:
logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
return False
finally:
# Clean up registry test directory
if os.path.exists(registry_dir):
shutil.rmtree(registry_dir)
async def test_docker_connect_mode():
"""Test Docker browser in connect mode.
This tests the basic functionality of creating a browser in Docker
connect mode and using it for navigation.
"""
logger.info("Testing Docker browser in connect mode", tag="TEST")
# Create temp directory for user data
temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
os.makedirs(temp_dir, exist_ok=True)
try:
# Create Docker configuration
docker_config = DockerConfig(
mode="connect",
persistent=False,
remove_on_exit=True,
user_data_dir=temp_dir
)
# Create browser configuration
browser_config = BrowserConfig(
browser_mode="docker",
headless=True,
docker_config=docker_config
)
# Create browser manager
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig(url="https://example.com")
# Get a page
page, context = await manager.get_page(crawler_config)
logger.info("Got page successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
finally:
# Clean up the temp directory
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
async def test_docker_launch_mode():
"""Test Docker browser in launch mode.
This tests launching a Chrome browser within a Docker container
on demand with custom settings.
"""
logger.info("Testing Docker browser in launch mode", tag="TEST")
# Create temp directory for user data
temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
os.makedirs(temp_dir, exist_ok=True)
try:
# Create Docker configuration
docker_config = DockerConfig(
mode="launch",
persistent=False,
remove_on_exit=True,
user_data_dir=temp_dir
)
# Create browser configuration
browser_config = BrowserConfig(
browser_mode="docker",
headless=True,
text_mode=True, # Enable text mode for faster operation
docker_config=docker_config
)
# Create browser manager
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig(url="https://example.com")
# Get a page
page, context = await manager.get_page(crawler_config)
logger.info("Got page successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
finally:
# Clean up the temp directory
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
async def test_docker_persistent_storage():
"""Test Docker browser with persistent storage.
This tests creating localStorage data in one session and verifying
it persists to another session when using persistent storage.
"""
logger.info("Testing Docker browser with persistent storage", tag="TEST")
# Create a unique temp directory
test_id = uuid.uuid4().hex[:8]
temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
os.makedirs(temp_dir, exist_ok=True)
manager1 = None
manager2 = None
try:
# Create Docker configuration with persistence
docker_config = DockerConfig(
mode="connect",
persistent=True, # Keep container running between sessions
user_data_dir=temp_dir,
container_user_data_dir="/data"
)
# Create browser configuration
browser_config = BrowserConfig(
browser_mode="docker",
headless=True,
docker_config=docker_config
)
# Create first browser manager
manager1 = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager1.start()
logger.info("First browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig()
# Get a page
page1, context1 = await manager1.get_page(crawler_config)
# Navigate to example.com
await page1.goto("https://example.com")
# Set localStorage item
test_value = f"test_value_{test_id}"
await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")
# Close the first browser manager
await manager1.close()
logger.info("First browser closed", tag="TEST")
# Create second browser manager with same config
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager2.start()
logger.info("Second browser started successfully", tag="TEST")
# Get a page
page2, context2 = await manager2.get_page(crawler_config)
# Navigate to same site
await page2.goto("https://example.com")
# Get localStorage item
value = await page2.evaluate("localStorage.getItem('test_key')")
logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")
# Check if persistence worked
if value == test_value:
logger.success("Storage persistence verified!", tag="TEST")
else:
logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")
# Clean up
await manager2.close()
logger.info("Second browser closed successfully", tag="TEST")
return value == test_value
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
if manager1:
await manager1.close()
if manager2:
await manager2.close()
except:
pass
return False
finally:
# Clean up the temp directory
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
async def test_docker_parallel_pages():
"""Test Docker browser with parallel page creation.
This tests the ability to create and use multiple pages in parallel
from a single Docker browser instance.
"""
logger.info("Testing Docker browser with parallel pages", tag="TEST")
try:
# Create Docker configuration
docker_config = DockerConfig(
mode="connect",
persistent=False,
remove_on_exit=True
)
# Create browser configuration
browser_config = BrowserConfig(
browser_mode="docker",
headless=True,
docker_config=docker_config
)
# Create browser manager
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig()
# Get multiple pages
page_count = 3
pages = await manager.get_pages(crawler_config, count=page_count)
logger.info(f"Got {len(pages)} pages successfully", tag="TEST")
if len(pages) != page_count:
logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
await manager.close()
return False
# Navigate to different sites with each page
tasks = []
for i, (page, _) in enumerate(pages):
tasks.append(page.goto(f"https://example.com?page={i}"))
# Wait for all navigations to complete
await asyncio.gather(*tasks)
logger.info("All pages navigated successfully", tag="TEST")
# Get titles from all pages
titles = []
for i, (page, _) in enumerate(pages):
title = await page.title()
titles.append(title)
logger.info(f"Page {i+1} title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_docker_registry_reuse():
"""Test Docker container reuse via registry.
This tests that containers with matching configurations
are reused rather than new ones being created.
"""
logger.info("Testing Docker container reuse via registry", tag="TEST")
# Create registry for this test
registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
registry_file = os.path.join(registry_dir, "registry.json")
os.makedirs(registry_dir, exist_ok=True)
manager1 = None
manager2 = None
container_id1 = None
try:
# Create identical Docker configurations with custom registry
docker_config1 = DockerConfig(
mode="connect",
persistent=True, # Keep container running after closing
registry_file=registry_file
)
# Create first browser configuration
browser_config1 = BrowserConfig(
browser_mode="docker",
headless=True,
docker_config=docker_config1
)
# Create first browser manager
manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
# Start the first browser
await manager1.start()
logger.info("First browser started successfully", tag="TEST")
# Get container ID from the strategy
docker_strategy1 = manager1.strategy
container_id1 = docker_strategy1.container_id
logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")
# Close the first manager but keep container running
await manager1.close()
logger.info("First browser closed", tag="TEST")
# Create second Docker configuration identical to first
docker_config2 = DockerConfig(
mode="connect",
persistent=True,
registry_file=registry_file
)
# Create second browser configuration
browser_config2 = BrowserConfig(
browser_mode="docker",
headless=True,
docker_config=docker_config2
)
# Create second browser manager
manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
# Start the second browser - should reuse existing container
await manager2.start()
logger.info("Second browser started successfully", tag="TEST")
# Get container ID from the second strategy
docker_strategy2 = manager2.strategy
container_id2 = docker_strategy2.container_id
logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")
# Verify container reuse
if container_id1 == container_id2:
logger.success("Container reuse successful - using same container!", tag="TEST")
else:
logger.error("Container reuse failed - new container created!", tag="TEST")
# Clean up
docker_strategy2.docker_config.persistent = False
docker_strategy2.docker_config.remove_on_exit = True
await manager2.close()
logger.info("Second browser closed and container removed", tag="TEST")
return container_id1 == container_id2
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
if manager1:
await manager1.close()
if manager2:
await manager2.close()
# Make sure container is removed
if container_id1:
await docker_utils.remove_container(container_id1, force=True)
except:
pass
return False
finally:
# Clean up registry directory
if os.path.exists(registry_dir):
shutil.rmtree(registry_dir)
async def run_tests():
"""Run all tests sequentially."""
results = []
logger.info("Starting Docker Browser Strategy tests", tag="TEST")
# Check if Docker is available
if not await check_docker_available():
logger.error("Docker is not available - skipping tests", tag="TEST")
return
# First test Docker components
# setup_result = await test_docker_components()
# if not setup_result:
# logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
# return
# Run browser tests
results.append(await test_docker_connect_mode())
results.append(await test_docker_launch_mode())
results.append(await test_docker_persistent_storage())
results.append(await test_docker_parallel_pages())
results.append(await test_docker_registry_reuse())
# Print summary
total = len(results)
passed = sum(1 for r in results if r)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
async def check_docker_available() -> bool:
"""Check if Docker is available on the system.
Returns:
bool: True if Docker is available, False otherwise
"""
try:
proc = await asyncio.create_subprocess_exec(
"docker", "--version",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
stdout, _ = await proc.communicate()
return proc.returncode == 0 and bool(stdout)
except:
return False
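# A stricter variant could also confirm the Docker daemon is reachable, not just
# that the CLI exists ("docker info" exits non-zero when the daemon is down).
# This is only a sketch and is not used by the tests above:
async def check_docker_daemon_reachable() -> bool:
    proc = await asyncio.create_subprocess_exec(
        "docker", "info",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    await proc.communicate()
    return proc.returncode == 0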
if __name__ == "__main__":
asyncio.run(run_tests())
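For ad-hoc debugging, a single test coroutine defined above can be run on its own instead of the full suite, e.g. by temporarily swapping the entry point (a minimal sketch):

if __name__ == "__main__":
    asyncio.run(test_docker_connect_mode())  # run only the connect-mode test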

View File

@@ -0,0 +1,525 @@
"""Demo script for testing the enhanced BrowserManager.
This script demonstrates the browser pooling capabilities of the enhanced
BrowserManager with various configurations and usage patterns.
"""
import asyncio
import time
import random
from crawl4ai.browser.manager import BrowserManager, UnavailableBehavior
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
SAFE_URLS = [
"https://example.com",
"https://example.com/page1",
"https://httpbin.org/get",
"https://httpbin.org/html",
"https://httpbin.org/ip",
"https://httpbin.org/user-agent",
"https://httpbin.org/headers",
"https://httpbin.org/cookies",
"https://httpstat.us/200",
"https://httpstat.us/301",
"https://httpstat.us/404",
"https://httpstat.us/500",
"https://jsonplaceholder.typicode.com/posts/1",
"https://jsonplaceholder.typicode.com/posts/2",
"https://jsonplaceholder.typicode.com/posts/3",
"https://jsonplaceholder.typicode.com/posts/4",
"https://jsonplaceholder.typicode.com/posts/5",
"https://jsonplaceholder.typicode.com/comments/1",
"https://jsonplaceholder.typicode.com/comments/2",
"https://jsonplaceholder.typicode.com/users/1",
"https://jsonplaceholder.typicode.com/users/2",
"https://jsonplaceholder.typicode.com/albums/1",
"https://jsonplaceholder.typicode.com/albums/2",
"https://jsonplaceholder.typicode.com/photos/1",
"https://jsonplaceholder.typicode.com/photos/2",
"https://jsonplaceholder.typicode.com/todos/1",
"https://jsonplaceholder.typicode.com/todos/2",
"https://www.iana.org",
"https://www.iana.org/domains",
"https://www.iana.org/numbers",
"https://www.iana.org/protocols",
"https://www.iana.org/about",
"https://www.iana.org/time-zones",
"https://www.data.gov",
"https://catalog.data.gov/dataset",
"https://www.archives.gov",
"https://www.usa.gov",
"https://www.loc.gov",
"https://www.irs.gov",
"https://www.census.gov",
"https://www.bls.gov",
"https://www.gpo.gov",
"https://www.w3.org",
"https://www.w3.org/standards",
"https://www.w3.org/WAI",
"https://www.rfc-editor.org",
"https://www.ietf.org",
"https://www.icann.org",
"https://www.internetsociety.org",
"https://www.python.org"
]
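high_volume_demo() below draws from this list with modulo indexing, so any number of crawl tasks simply wraps around it:

url = SAFE_URLS[index % len(SAFE_URLS)]  # index may exceed len(SAFE_URLS)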
async def basic_pooling_demo():
"""Demonstrate basic browser pooling functionality."""
print("\n=== Basic Browser Pooling Demo ===")
# Create logger
logger = AsyncLogger(verbose=True)
# Create browser configurations
config1 = BrowserConfig(
browser_type="chromium",
headless=True,
browser_mode="playwright"
)
config2 = BrowserConfig(
browser_type="chromium",
headless=True,
browser_mode="cdp"
)
# Create browser manager with on-demand behavior
manager = BrowserManager(
browser_config=config1,
logger=logger,
unavailable_behavior=UnavailableBehavior.ON_DEMAND,
max_browsers_per_config=3
)
try:
# Initialize pool with both configurations
print("Initializing browser pool...")
await manager.initialize_pool(
browser_configs=[config1, config2],
browsers_per_config=2
)
# Display initial pool status
status = await manager.get_pool_status()
print(f"Initial pool status: {status}")
# Create crawler run configurations
run_config1 = CrawlerRunConfig()
run_config2 = CrawlerRunConfig()
# Simulate concurrent page requests
print("\nGetting pages for parallel crawling...")
# Function to simulate crawling
async def simulate_crawl(index: int, config: BrowserConfig, run_config: CrawlerRunConfig):
print(f"Crawler {index}: Requesting page...")
page, context, strategy = await manager.get_page(run_config, config)
print(f"Crawler {index}: Got page, navigating to example.com...")
try:
await page.goto("https://example.com")
title = await page.title()
print(f"Crawler {index}: Page title: {title}")
# Simulate work
await asyncio.sleep(random.uniform(1, 3))
print(f"Crawler {index}: Work completed, releasing page...")
# Check dynamic page content
content = await page.content()
content_length = len(content)
print(f"Crawler {index}: Page content length: {content_length}")
except Exception as e:
print(f"Crawler {index}: Error: {str(e)}")
finally:
# Release the page
await manager.release_page(page, strategy, config)
print(f"Crawler {index}: Page released")
# Create 5 parallel crawls
crawl_tasks = []
for i in range(5):
# Alternate between configurations
config = config1 if i % 2 == 0 else config2
run_config = run_config1 if i % 2 == 0 else run_config2
task = asyncio.create_task(simulate_crawl(i+1, config, run_config))
crawl_tasks.append(task)
# Wait for all crawls to complete
await asyncio.gather(*crawl_tasks)
# Display final pool status
status = await manager.get_pool_status()
print(f"\nFinal pool status: {status}")
finally:
# Clean up
print("\nClosing browser manager...")
await manager.close()
print("Browser manager closed")
async def prewarm_pages_demo():
"""Demonstrate page pre-warming functionality."""
print("\n=== Page Pre-warming Demo ===")
# Create logger
logger = AsyncLogger(verbose=True)
# Create browser configuration
config = BrowserConfig(
browser_type="chromium",
headless=True,
browser_mode="playwright"
)
# Create crawler run configurations for pre-warming
run_config1 = CrawlerRunConfig(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
run_config2 = CrawlerRunConfig(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
)
# Create page pre-warm configurations
page_configs = [
(config, run_config1, 2), # 2 pages with run_config1
(config, run_config2, 3) # 3 pages with run_config2
]
# Create browser manager
manager = BrowserManager(
browser_config=config,
logger=logger,
unavailable_behavior=UnavailableBehavior.EXCEPTION
)
try:
# Initialize pool with pre-warmed pages
print("Initializing browser pool with pre-warmed pages...")
await manager.initialize_pool(
browser_configs=[config],
browsers_per_config=2,
page_configs=page_configs
)
# Display pool status
status = await manager.get_pool_status()
print(f"Pool status after pre-warming: {status}")
# Simulate using pre-warmed pages
print("\nUsing pre-warmed pages...")
async def use_prewarm_page(index: int, run_config: CrawlerRunConfig):
print(f"Task {index}: Requesting pre-warmed page...")
page, context, strategy = await manager.get_page(run_config, config)
try:
print(f"Task {index}: Got page, navigating to example.com...")
await page.goto("https://example.com")
# Verify user agent was applied correctly
user_agent = await page.evaluate("() => navigator.userAgent")
print(f"Task {index}: User agent: {user_agent}")
# Get page title
title = await page.title()
print(f"Task {index}: Page title: {title}")
# Simulate work
await asyncio.sleep(1)
finally:
# Release the page
print(f"Task {index}: Releasing page...")
await manager.release_page(page, strategy, config)
# Create tasks to use pre-warmed pages
tasks = []
# Use run_config1 pages
for i in range(2):
tasks.append(asyncio.create_task(use_prewarm_page(i+1, run_config1)))
# Use run_config2 pages
for i in range(3):
tasks.append(asyncio.create_task(use_prewarm_page(i+3, run_config2)))
# Wait for all tasks to complete
await asyncio.gather(*tasks)
# Try to use more pages than we pre-warmed (should raise exception)
print("\nTrying to use more pages than pre-warmed...")
try:
page, context, strategy = await manager.get_page(run_config1, config)
try:
print("Got extra page (unexpected)")
await page.goto("https://example.com")
finally:
await manager.release_page(page, strategy, config)
except Exception as e:
print(f"Expected exception when requesting more pages: {str(e)}")
finally:
# Clean up
print("\nClosing browser manager...")
await manager.close()
print("Browser manager closed")
async def prewarm_on_demand_demo():
"""Demonstrate pre-warming with on-demand browser creation."""
print("\n=== Pre-warming with On-Demand Browser Creation Demo ===")
# Create logger
logger = AsyncLogger(verbose=True)
# Create browser configuration
config = BrowserConfig(
browser_type="chromium",
headless=True,
browser_mode="playwright"
)
# Create crawler run configurations
run_config = CrawlerRunConfig(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
# Create page pre-warm configurations - just pre-warm 2 pages
page_configs = [
(config, run_config, 2)
]
# Create browser manager with ON_DEMAND behavior
manager = BrowserManager(
browser_config=config,
logger=logger,
unavailable_behavior=UnavailableBehavior.ON_DEMAND,
max_browsers_per_config=5 # Allow up to 5 browsers
)
try:
# Initialize pool with pre-warmed pages
print("Initializing browser pool with pre-warmed pages...")
await manager.initialize_pool(
browser_configs=[config],
browsers_per_config=1, # Start with just 1 browser
page_configs=page_configs
)
# Display initial pool status
status = await manager.get_pool_status()
print(f"Initial pool status: {status}")
# Simulate using more pages than pre-warmed - should create browsers on demand
print("\nUsing more pages than pre-warmed (should create on demand)...")
async def use_page(index: int):
print(f"Task {index}: Requesting page...")
page, context, strategy = await manager.get_page(run_config, config)
try:
print(f"Task {index}: Got page, navigating to example.com...")
await page.goto("https://example.com")
# Get page title
title = await page.title()
print(f"Task {index}: Page title: {title}")
# Simulate work for a varying amount of time
work_time = 1 + (index * 0.5) # Stagger completion times
print(f"Task {index}: Working for {work_time} seconds...")
await asyncio.sleep(work_time)
print(f"Task {index}: Work completed")
finally:
# Release the page
print(f"Task {index}: Releasing page...")
await manager.release_page(page, strategy, config)
# Create more tasks than pre-warmed pages
tasks = []
for i in range(5): # Try to use 5 pages when only 2 are pre-warmed
tasks.append(asyncio.create_task(use_page(i+1)))
# Wait for all tasks to complete
await asyncio.gather(*tasks)
# Display final pool status - should show on-demand created browsers
status = await manager.get_pool_status()
print(f"\nFinal pool status: {status}")
finally:
# Clean up
print("\nClosing browser manager...")
await manager.close()
print("Browser manager closed")
async def high_volume_demo():
"""Demonstrate high-volume access to pre-warmed pages."""
print("\n=== High Volume Pre-warmed Pages Demo ===")
# Create logger
logger = AsyncLogger(verbose=True)
# Create browser configuration
config = BrowserConfig(
browser_type="chromium",
headless=True,
browser_mode="playwright"
)
# Create crawler run configuration
run_config = CrawlerRunConfig(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
# Set up dimensions
browser_count = 10
pages_per_browser = 5
total_pages = browser_count * pages_per_browser
# Create page pre-warm configuration
page_configs = [
(config, run_config, total_pages)
]
print(f"Preparing {browser_count} browsers with {pages_per_browser} pages each ({total_pages} total pages)")
# Create browser manager with ON_DEMAND behavior as fallback
# No need to specify max_browsers_per_config as it will be calculated automatically
manager = BrowserManager(
browser_config=config,
logger=logger,
unavailable_behavior=UnavailableBehavior.ON_DEMAND
)
try:
# Initialize pool with browsers and pre-warmed pages
print(f"Pre-warming {total_pages} pages...")
start_time = time.time()
await manager.initialize_pool(
browser_configs=[config],
browsers_per_config=browser_count,
page_configs=page_configs
)
warmup_time = time.time() - start_time
print(f"Pre-warming completed in {warmup_time:.2f} seconds")
# Display pool status
status = await manager.get_pool_status()
print(f"Pool status after pre-warming: {status}")
# Simulate using all pre-warmed pages simultaneously
print(f"\nSending {total_pages} crawl requests simultaneously...")
async def crawl_page(index: int):
# url = f"https://example.com/page{index}"
url = SAFE_URLS[index % len(SAFE_URLS)]
print(f"Page {index}: Requesting page...")
# Measure time to acquire page
page_start = time.time()
page, context, strategy = await manager.get_page(run_config, config)
page_acquisition_time = time.time() - page_start
try:
# Navigate to the URL
nav_start = time.time()
await page.goto(url, timeout=5000)
navigation_time = time.time() - nav_start
# Get the page title
title = await page.title()
return {
"index": index,
"url": url,
"title": title,
"page_acquisition_time": page_acquisition_time,
"navigation_time": navigation_time
}
except PlaywrightTimeoutError as e:
# print(f"Page {index}: Navigation timed out - {e}")
return {
"index": index,
"url": url,
"title": "Navigation timed out",
"page_acquisition_time": page_acquisition_time,
"navigation_time": 0
}
finally:
# Release the page
await manager.release_page(page, strategy, config)
# Create and execute all tasks simultaneously
start_time = time.time()
# Non-parallel way
# for i in range(total_pages):
# await crawl_page(i+1)
tasks = [crawl_page(i+1) for i in range(total_pages)]
results = await asyncio.gather(*tasks)
total_time = time.time() - start_time
# # Print all titles
# for result in results:
# print(f"Page {result['index']} ({result['url']}): Title: {result['title']}")
# print(f" Page acquisition time: {result['page_acquisition_time']:.4f}s")
# print(f" Navigation time: {result['navigation_time']:.4f}s")
# print(f" Total time: {result['page_acquisition_time'] + result['navigation_time']:.4f}s")
# print("-" * 40)
# Report results
print(f"\nAll {total_pages} crawls completed in {total_time:.2f} seconds")
# Calculate statistics
acquisition_times = [r["page_acquisition_time"] for r in results]
navigation_times = [r["navigation_time"] for r in results]
avg_acquisition = sum(acquisition_times) / len(acquisition_times)
max_acquisition = max(acquisition_times)
min_acquisition = min(acquisition_times)
avg_navigation = sum(navigation_times) / len(navigation_times)
max_navigation = max(navigation_times)
min_navigation = min(navigation_times)
print("\nPage acquisition times:")
print(f" Average: {avg_acquisition:.4f}s")
print(f" Min: {min_acquisition:.4f}s")
print(f" Max: {max_acquisition:.4f}s")
print("\nPage navigation times:")
print(f" Average: {avg_navigation:.4f}s")
print(f" Min: {min_navigation:.4f}s")
print(f" Max: {max_navigation:.4f}s")
# Display final pool status
status = await manager.get_pool_status()
print(f"\nFinal pool status: {status}")
finally:
# Clean up
print("\nClosing browser manager...")
await manager.close()
print("Browser manager closed")
async def main():
"""Run all demos."""
# await basic_pooling_demo()
# await prewarm_pages_demo()
# await prewarm_on_demand_demo()
await high_volume_demo()
# Additional demo functions can be added here
if __name__ == "__main__":
asyncio.run(main())
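Only high_volume_demo() is enabled in main() above; the other demos defined in this file can be re-enabled by editing main(), for example (a sketch):

async def main():
    await basic_pooling_demo()
    await prewarm_pages_demo()
    await prewarm_on_demand_demo()
    await high_volume_demo()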

View File

@@ -0,0 +1,190 @@
"""Test examples for BrowserManager.
These examples demonstrate the functionality of BrowserManager
and serve as functional tests.
"""
import asyncio
import os
import sys
from typing import List
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_basic_browser_manager():
"""Test basic BrowserManager functionality with default configuration."""
logger.info("Starting test_basic_browser_manager", tag="TEST")
try:
# Create a browser manager with default config
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
logger.info("Page created successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_basic_browser_manager completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
return False
async def test_custom_browser_config():
"""Test BrowserManager with custom browser configuration."""
logger.info("Starting test_custom_browser_config", tag="TEST")
try:
# Create a custom browser config
browser_config = BrowserConfig(
browser_type="chromium",
headless=True,
viewport_width=1280,
viewport_height=800,
light_mode=True
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully with custom config", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
# Navigate to a website
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Verify viewport size
viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
logger.info(f"Viewport size: {viewport_size}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_custom_browser_config completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
return False
async def test_multiple_pages():
"""Test BrowserManager with multiple pages."""
logger.info("Starting test_multiple_pages", tag="TEST")
try:
# Create browser manager
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create multiple pages
pages = []
urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
for i, url in enumerate(urls):
crawler_config = CrawlerRunConfig(url=url)
page, context = await manager.get_page(crawler_config)
await page.goto(url)
pages.append((page, url))
logger.info(f"Created page {i+1} for {url}", tag="TEST")
# Verify all pages are loaded correctly
for i, (page, url) in enumerate(pages):
title = await page.title()
logger.info(f"Page {i+1} title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_multiple_pages completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
return False
async def test_session_management():
"""Test session management in BrowserManager."""
logger.info("Starting test_session_management", tag="TEST")
try:
# Create browser manager
manager = BrowserManager(logger=logger)
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create a session
session_id = "test_session_1"
crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
page1, context1 = await manager.get_page(crawler_config)
await page1.goto("https://example.com")
logger.info(f"Created session with ID: {session_id}", tag="TEST")
# Get the same session again
page2, context2 = await manager.get_page(crawler_config)
# Verify it's the same page/context
is_same_page = page1 == page2
is_same_context = context1 == context2
logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
# Kill the session
await manager.kill_session(session_id)
logger.info(f"Killed session with ID: {session_id}", tag="TEST")
# Clean up
await manager.close()
logger.success("test_session_management completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_basic_browser_manager())
results.append(await test_custom_browser_config())
results.append(await test_multiple_pages())
results.append(await test_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,809 @@
"""
Test script for builtin browser functionality in the browser module.
This script tests:
1. Creating a builtin browser
2. Getting browser information
3. Killing the browser
4. Restarting the browser
5. Testing operations with different browser strategies
6. Testing edge cases
"""
import asyncio
import os
import sys
import time
from typing import List, Dict, Any
from colorama import Fore, Style, init
# Add the project root to the path for imports
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from rich.box import Box, SIMPLE
from crawl4ai.browser import BrowserManager
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Initialize colorama for cross-platform colored terminal output
init()
# Define colors for pretty output
SUCCESS = Fore.GREEN
WARNING = Fore.YELLOW
ERROR = Fore.RED
INFO = Fore.CYAN
RESET = Fore.RESET
# Create logger
logger = AsyncLogger(verbose=True)
async def test_builtin_browser_creation():
"""Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy"""
print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")
# Step 1: Create a BrowserManager with builtin mode
print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Step 2: Check if we have a BuiltinBrowserStrategy
print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
if isinstance(manager.strategy, BuiltinBrowserStrategy):
print(
f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}"
)
else:
print(
f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}"
)
return None
# Step 3: Start the manager to launch or connect to builtin browser
print(f"\n{INFO}3. Starting the browser manager{RESET}")
try:
await manager.start()
print(f"{SUCCESS}Browser manager started successfully{RESET}")
except Exception as e:
print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
return None
# Step 4: Get browser info from the strategy
print(f"\n{INFO}4. Getting browser information{RESET}")
browser_info = manager.strategy.get_browser_info()
if browser_info:
print(f"{SUCCESS}Browser info retrieved:{RESET}")
for key, value in browser_info.items():
if key != "config": # Skip the verbose config section
print(f" {key}: {value}")
cdp_url = browser_info.get("cdp_url")
print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to get browser information{RESET}")
cdp_url = None
# Save manager for later tests
return manager, cdp_url
async def test_page_operations(manager: BrowserManager):
"""Test page operations with the builtin browser"""
print(
f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
)
# Step 1: Get a single page
print(f"\n{INFO}1. Getting a single page{RESET}")
try:
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
print(f"{SUCCESS}Got page successfully{RESET}")
# Navigate to a test URL
await page.goto("https://example.com")
title = await page.title()
print(f"{SUCCESS}Page title: {title}{RESET}")
# Close the page
await page.close()
print(f"{SUCCESS}Page closed successfully{RESET}")
except Exception as e:
print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
return False
# Step 2: Get multiple pages
print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
try:
# Request 3 pages
crawler_config = CrawlerRunConfig()
pages = await manager.get_pages(crawler_config, count=3)
print(f"{SUCCESS}Got {len(pages)} pages{RESET}")
# Test each page
for i, (page, context) in enumerate(pages):
await page.goto(f"https://example.com?test={i}")
title = await page.title()
print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
await page.close()
print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
except Exception as e:
print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
return False
return True
async def test_browser_status_management(manager: BrowserManager):
"""Test browser status and management operations"""
print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")
# Step 1: Get browser status
print(f"\n{INFO}1. Getting browser status{RESET}")
try:
status = await manager.strategy.get_builtin_browser_status()
print(f"{SUCCESS}Browser status:{RESET}")
print(f" Running: {status['running']}")
print(f" CDP URL: {status['cdp_url']}")
except Exception as e:
print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
return False
# Step 2: Test killing the browser
print(f"\n{INFO}2. Testing killing the browser{RESET}")
try:
result = await manager.strategy.kill_builtin_browser()
if result:
print(f"{SUCCESS}Browser killed successfully{RESET}")
else:
print(f"{ERROR}Failed to kill browser{RESET}")
except Exception as e:
print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
return False
# Step 3: Check status after kill
print(f"\n{INFO}3. Checking status after kill{RESET}")
try:
status = await manager.strategy.get_builtin_browser_status()
if not status["running"]:
print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
else:
print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
except Exception as e:
print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
return False
# Step 4: Launch a new browser
print(f"\n{INFO}4. Launching a new browser{RESET}")
try:
cdp_url = await manager.strategy.launch_builtin_browser(
browser_type="chromium", headless=True
)
if cdp_url:
print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
else:
print(f"{ERROR}Failed to launch new browser{RESET}")
return False
except Exception as e:
print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
return False
return True
async def test_multiple_managers():
"""Test creating multiple BrowserManagers that use the same builtin browser"""
print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")
# Step 1: Create first manager
print(f"\n{INFO}1. Creating first browser manager{RESET}")
browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
# Step 2: Create second manager
print(f"\n{INFO}2. Creating second browser manager{RESET}")
browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
# Step 3: Start both managers (should connect to the same builtin browser)
print(f"\n{INFO}3. Starting both managers{RESET}")
try:
await manager1.start()
print(f"{SUCCESS}First manager started{RESET}")
await manager2.start()
print(f"{SUCCESS}Second manager started{RESET}")
# Check if they got the same CDP URL
cdp_url1 = manager1.strategy.config.cdp_url
cdp_url2 = manager2.strategy.config.cdp_url
if cdp_url1 == cdp_url2:
print(
f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
)
else:
print(
f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
)
except Exception as e:
print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
return False
# Step 4: Test using both managers
print(f"\n{INFO}4. Testing operations with both managers{RESET}")
try:
# First manager creates a page
page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
await page1.goto("https://example.com")
title1 = await page1.title()
print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")
# Second manager creates a page
page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
await page2.goto("https://example.org")
title2 = await page2.title()
print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")
# Clean up
await page1.close()
await page2.close()
except Exception as e:
print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
return False
# Step 5: Close both managers
print(f"\n{INFO}5. Closing both managers{RESET}")
try:
await manager1.close()
print(f"{SUCCESS}First manager closed{RESET}")
await manager2.close()
print(f"{SUCCESS}Second manager closed{RESET}")
except Exception as e:
print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
return False
return True
async def test_edge_cases():
"""Test edge cases like multiple starts, killing browser during operations, etc."""
print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
# Step 1: Test multiple starts with the same manager
print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
print(f"{SUCCESS}First start successful{RESET}")
# Try to start again
await manager.start()
print(f"{SUCCESS}Second start completed without errors{RESET}")
# Test if it's still functional
page, context = await manager.get_page(CrawlerRunConfig())
await page.goto("https://example.com")
title = await page.title()
print(
f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
)
await page.close()
except Exception as e:
print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
return False
finally:
await manager.close()
# Step 2: Test killing the browser while manager is active
print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
print(f"{SUCCESS}Manager started{RESET}")
# Kill the browser directly
print(f"{INFO}Killing the browser...{RESET}")
await manager.strategy.kill_builtin_browser()
print(f"{SUCCESS}Browser killed{RESET}")
# Try to get a page (should fail or launch a new browser)
try:
page, context = await manager.get_page(CrawlerRunConfig())
print(
f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
)
title = await page.title()
print(f"{SUCCESS}Got page title: {title}{RESET}")
await page.close()
except Exception as e:
print(
f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
)
except Exception as e:
print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
return False
finally:
await manager.close()
return True
async def cleanup_browsers():
"""Clean up any remaining builtin browsers"""
print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# No need to start, just access the strategy directly
strategy = manager.strategy
if isinstance(strategy, BuiltinBrowserStrategy):
result = await strategy.kill_builtin_browser()
if result:
print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
else:
print(f"{WARNING}No builtin browsers found to kill{RESET}")
else:
print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
except Exception as e:
print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
finally:
# Just to be safe
try:
await manager.close()
except:
pass
async def test_performance_scaling():
"""Test performance with multiple browsers and pages.
This test creates multiple browsers on different ports,
spawns multiple pages per browser, and measures performance metrics.
"""
print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
# Configuration parameters
num_browsers = 10
pages_per_browser = 10
total_pages = num_browsers * pages_per_browser
base_port = 9222
# Set up a measuring mechanism for memory
import psutil
import gc
# Force garbage collection before starting
gc.collect()
process = psutil.Process()
initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
peak_memory = initial_memory
# Report initial configuration
print(
f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
)
# List to track managers
managers: List[BrowserManager] = []
all_pages = []
# Get crawl4ai home directory
crawl4ai_home = os.path.expanduser("~/.crawl4ai")
temp_dir = os.path.join(crawl4ai_home, "temp")
os.makedirs(temp_dir, exist_ok=True)
# Create all managers but don't start them yet
manager_configs = []
for i in range(num_browsers):
port = base_port + i
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True,
debugging_port=port,
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
manager.strategy.shutting_down = True
manager_configs.append((manager, i, port))
# Define async function to start a single manager
async def start_manager(manager, index, port):
try:
await manager.start()
return manager
except Exception as e:
print(
f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
)
return None
# Start all managers in parallel
start_tasks = [
start_manager(manager, i, port) for manager, i, port in manager_configs
]
started_managers = await asyncio.gather(*start_tasks)
# Filter out None values (failed starts) and add to managers list
managers = [m for m in started_managers if m is not None]
if len(managers) == 0:
print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
return False
if len(managers) < num_browsers:
print(
f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
)
# Create pages for each browser
for i, manager in enumerate(managers):
try:
pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
all_pages.extend(pages)
except Exception as e:
print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
# Check memory after page creation
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Ask for confirmation before loading
confirmation = input(
f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
)
# Start timing the crawl phase
start_time = time.time()
if confirmation.lower() == "y":
load_start_time = time.time()
# Function to load a single page
async def load_page(page_ctx, index):
page, _ = page_ctx
try:
await page.goto(f"https://example.com/page{index}", timeout=30000)
title = await page.title()
return title
except Exception as e:
return f"Error: {str(e)}"
# Load all pages concurrently
load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
# Count successes and failures
successes = sum(
1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
)
failures = len(load_results) - successes
load_time = time.time() - load_start_time
total_test_time = time.time() - start_time
# Check memory after loading (peak memory)
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Calculate key metrics
memory_per_page = peak_memory / successes if successes > 0 else 0
time_per_crawl = total_test_time / successes if successes > 0 else 0
crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
crawls_per_minute = crawls_per_second * 60
crawls_per_hour = crawls_per_minute * 60
# Print simplified performance summary
from rich.console import Console
from rich.table import Table
console = Console()
# Create a simple summary table
table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
table.add_column("Metric", style="cyan")
table.add_column("Value", style="green")
table.add_row("Total Crawls Completed", f"{successes}")
table.add_row("Total Time", f"{total_test_time:.2f} seconds")
table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
# Display the table
console.print(table)
# Ask confirmation before cleanup
confirmation = input(
f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
)
if confirmation.lower() != "y":
print(f"{WARNING}Cleanup aborted by user{RESET}")
return False
# Close all pages
for page, _ in all_pages:
try:
await page.close()
except:
pass
# Close all managers
for manager in managers:
try:
await manager.close()
except:
pass
# Remove the temp directory
import shutil
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
return True
async def test_performance_scaling_lab(num_browsers: int = 10, pages_per_browser: int = 10):
"""Test performance with multiple browsers and pages.
This test creates multiple browsers on different ports,
spawns multiple pages per browser, and measures performance metrics.
"""
print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
# Configuration parameters
total_pages = num_browsers * pages_per_browser
base_port = 9222
# Set up a measuring mechanism for memory
import psutil
import gc
# Force garbage collection before starting
gc.collect()
process = psutil.Process()
initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
peak_memory = initial_memory
# Report initial configuration
print(
f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
)
# List to track managers
managers: List[BrowserManager] = []
all_pages = []
# Get crawl4ai home directory
crawl4ai_home = os.path.expanduser("~/.crawl4ai")
temp_dir = os.path.join(crawl4ai_home, "temp")
os.makedirs(temp_dir, exist_ok=True)
# Create all managers but don't start them yet
manager_configs = []
for i in range(num_browsers):
port = base_port + i
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True,
debugging_port=port,
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
)
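# Note (assumption): each browser gets its own user_data_dir so that concurrently
# launched instances do not contend for the same on-disk profile/lock.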
manager = BrowserManager(browser_config=browser_config, logger=logger)
manager.strategy.shutting_down = True
manager_configs.append((manager, i, port))
# Define async function to start a single manager
async def start_manager(manager, index, port):
try:
await manager.start()
return manager
except Exception as e:
print(
f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
)
return None
# Start all managers in parallel
start_tasks = [
start_manager(manager, i, port) for manager, i, port in manager_configs
]
started_managers = await asyncio.gather(*start_tasks)
# Filter out None values (failed starts) and add to managers list
managers = [m for m in started_managers if m is not None]
if len(managers) == 0:
print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
return False
if len(managers) < num_browsers:
print(
f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
)
# Create pages for each browser
for i, manager in enumerate(managers):
try:
pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
all_pages.extend(pages)
except Exception as e:
print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
# Check memory after page creation
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Ask for confirmation before loading
confirmation = input(
f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
)
# Start the overall timer for the load/measurement phase
start_time = time.time()
if confirmation.lower() == "y":
load_start_time = time.time()
# Function to load a single page
async def load_page(page_ctx, index):
page, _ = page_ctx
try:
await page.goto(f"https://example.com/page{index}", timeout=30000)
title = await page.title()
return title
except Exception as e:
return f"Error: {str(e)}"
# Load all pages concurrently
load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
# Count successes and failures
successes = sum(
1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
)
failures = len(load_results) - successes
load_time = time.time() - load_start_time
total_test_time = time.time() - start_time
# Check memory after loading (peak memory)
gc.collect()
current_memory = process.memory_info().rss / 1024 / 1024
peak_memory = max(peak_memory, current_memory)
# Calculate key metrics
memory_per_page = peak_memory / successes if successes > 0 else 0
time_per_crawl = total_test_time / successes if successes > 0 else 0
crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
crawls_per_minute = crawls_per_second * 60
crawls_per_hour = crawls_per_minute * 60
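# Worked example (illustrative): 2.5 crawls/second -> 150 crawls/minute -> 9000 crawls/hour.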
# Print simplified performance summary
from rich.console import Console
from rich.table import Table
console = Console()
# Create a simple summary table
table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
table.add_column("Metric", style="cyan")
table.add_column("Value", style="green")
table.add_row("Total Crawls Completed", f"{successes}")
table.add_row("Total Time", f"{total_test_time:.2f} seconds")
table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
# Display the table
console.print(table)
# Ask confirmation before cleanup
confirmation = input(
f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
)
if confirmation.lower() != "y":
print(f"{WARNING}Cleanup aborted by user{RESET}")
return False
# Close all pages
for page, _ in all_pages:
try:
await page.close()
except:
pass
# Close all managers
for manager in managers:
try:
await manager.close()
except:
pass
# Remove the temp directory
import shutil
if os.path.exists(temp_dir):
shutil.rmtree(temp_dir)
return True
async def main():
"""Run all tests"""
try:
print(f"{INFO}Starting builtin browser tests with browser module{RESET}")
# # Run browser creation test
# manager, cdp_url = await test_builtin_browser_creation()
# if not manager:
# print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
# return
# # Run page operations test
# await test_page_operations(manager)
# # Run browser status and management test
# await test_browser_status_management(manager)
# # Close manager before multiple manager test
# await manager.close()
# Run multiple managers test
await test_multiple_managers()
# Run performance scaling test
await test_performance_scaling()
# Run cleanup test
await cleanup_browsers()
# Run edge cases test
await test_edge_cases()
print(f"\n{SUCCESS}All tests completed!{RESET}")
except Exception as e:
print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
import traceback
traceback.print_exc()
finally:
# Clean up: kill any remaining builtin browsers
await cleanup_browsers()
print(f"{SUCCESS}Test cleanup complete{RESET}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,160 @@
"""Test examples for BuiltinBrowserStrategy.
These examples demonstrate the functionality of BuiltinBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_builtin_browser():
"""Test using a builtin browser that persists between sessions."""
logger.info("Testing builtin browser", tag="TEST")
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start should connect to existing builtin browser or create one
await manager.start()
logger.info("Connected to builtin browser", tag="TEST")
# Test page creation
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Test navigation
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Close manager (should not close the builtin browser)
await manager.close()
logger.info("First session closed", tag="TEST")
# Create a second manager to verify browser persistence
logger.info("Creating second session to verify persistence", tag="TEST")
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
logger.info("Connected to existing builtin browser", tag="TEST")
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.org")
title2 = await page2.title()
logger.info(f"Second session page title: {title2}", tag="TEST")
await manager2.close()
logger.info("Second session closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_builtin_browser_status():
"""Test getting status of the builtin browser."""
logger.info("Testing builtin browser status", tag="TEST")
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
browser_config = BrowserConfig(
browser_mode="builtin",
headless=True
)
# Create strategy directly to access its status methods
strategy = BuiltinBrowserStrategy(browser_config, logger)
try:
# Get status before starting (should be not running)
status_before = await strategy.get_builtin_browser_status()
logger.info(f"Initial status: {status_before}", tag="TEST")
# Start the browser
await strategy.start()
logger.info("Browser started successfully", tag="TEST")
# Get status after starting
status_after = await strategy.get_builtin_browser_status()
logger.info(f"Status after start: {status_after}", tag="TEST")
# Create a page to verify functionality
crawler_config = CrawlerRunConfig()
page, context = await strategy.get_page(crawler_config)
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Close strategy (should not kill the builtin browser)
await strategy.close()
logger.info("Strategy closed successfully", tag="TEST")
# Create a new strategy object
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
# Get status again (should still be running)
status_final = await strategy2.get_builtin_browser_status()
logger.info(f"Final status: {status_final}", tag="TEST")
# Verify that the status shows the browser is running
is_running = status_final.get('running', False)
logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
# Kill the builtin browser to clean up
logger.info("Killing builtin browser", tag="TEST")
success = await strategy2.kill_builtin_browser()
logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
return is_running and success
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await strategy.close()
# Try to kill the builtin browser to clean up
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
await strategy2.kill_builtin_browser()
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_builtin_browser())
results.append(await test_builtin_browser_status())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,228 @@
"""Test examples for CDPBrowserStrategy.
These examples demonstrate the functionality of CDPBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_cdp_launch_connect():
"""Test launching a browser and connecting via CDP."""
logger.info("Testing launch and connect via CDP", tag="TEST")
browser_config = BrowserConfig(
use_managed_browser=True,
browser_mode="cdp",
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched and connected via CDP", tag="TEST")
# Test with multiple pages
pages = []
for i in range(3):
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
await page.goto(f"https://example.com?test={i}")
pages.append(page)
logger.info(f"Created page {i+1}", tag="TEST")
# Verify all pages are working
for i, page in enumerate(pages):
title = await page.title()
logger.info(f"Page {i+1} title: {title}", tag="TEST")
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_cdp_with_user_data_dir():
"""Test CDP browser with a user data directory."""
logger.info("Testing CDP browser with user data directory", tag="TEST")
# Create a temporary user data directory
import tempfile
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
browser_config = BrowserConfig(
headless=True,
browser_mode="cdp",
user_data_dir=user_data_dir
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched with user data directory", tag="TEST")
# Navigate to a page and store some data
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Set a cookie
await context.add_cookies([{
"name": "test_cookie",
"value": "test_value",
"url": "https://example.com"
}])
# Visit the site
await page.goto("https://example.com")
# Verify cookie was set
cookies = await context.cookies(["https://example.com"])
has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
# Close the browser
await manager.close()
logger.info("First browser session closed", tag="TEST")
# Start a new browser with the same user data directory
logger.info("Starting second browser session with same user data directory", tag="TEST")
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
# Get a new page and check if the cookie persists
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.com")
# Verify cookie persisted
cookies2 = await context2.cookies(["https://example.com"])
has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
# Clean up
await manager2.close()
# Remove temporary directory
import shutil
shutil.rmtree(user_data_dir, ignore_errors=True)
logger.info(f"Removed temporary user data directory", tag="TEST")
return has_test_cookie and has_test_cookie2
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
# Clean up temporary directory
try:
import shutil
shutil.rmtree(user_data_dir, ignore_errors=True)
except:
pass
return False
async def test_cdp_session_management():
"""Test session management with CDP browser."""
logger.info("Testing session management with CDP browser", tag="TEST")
browser_config = BrowserConfig(
use_managed_browser=True,
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched successfully", tag="TEST")
# Create two sessions
session1_id = "test_session_1"
session2_id = "test_session_2"
# Set up first session
crawler_config1 = CrawlerRunConfig(session_id=session1_id)
page1, context1 = await manager.get_page(crawler_config1)
await page1.goto("https://example.com")
await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
# Set up second session
crawler_config2 = CrawlerRunConfig(session_id=session2_id)
page2, context2 = await manager.get_page(crawler_config2)
await page2.goto("https://example.org")
await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
# Get first session again
page1_again, _ = await manager.get_page(crawler_config1)
# Verify it's the same page and data persists
is_same_page = page1 == page1_again
data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
# Kill first session
await manager.kill_session(session1_id)
logger.info(f"Killed session 1", tag="TEST")
# Verify second session still works
data2 = await page2.evaluate("localStorage.getItem('session2_data')")
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return is_same_page and data1 == "test_value" and data2 == "test_value2"
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
# results.append(await test_cdp_launch_connect())
results.append(await test_cdp_with_user_data_dir())
results.append(await test_cdp_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,77 @@
"""Combined test runner for all browser module tests.
This script runs all the browser module tests in sequence and
provides a comprehensive summary.
"""
import asyncio
import os
import sys
import time
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def run_test_module(module_name, header):
"""Run all tests in a module and return results."""
logger.info(f"\n{'-'*30}", tag="TEST")
logger.info(f"RUNNING: {header}", tag="TEST")
logger.info(f"{'-'*30}", tag="TEST")
# Import the module dynamically
module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
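# Example (illustrative): module_name="test_cdp_strategy" imports
# tests.browser.test_cdp_strategy, and its run_tests() coroutine is awaited below.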
# Track time for performance measurement
start_time = time.time()
# Run the tests
await module.run_tests()
# Calculate time taken
time_taken = time.time() - start_time
logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
return time_taken
async def main():
"""Run all test modules."""
logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
# List of test modules to run
test_modules = [
("test_browser_manager", "Browser Manager Tests"),
("test_playwright_strategy", "Playwright Strategy Tests"),
("test_cdp_strategy", "CDP Strategy Tests"),
("test_builtin_strategy", "Builtin Browser Strategy Tests"),
("test_profiles", "Profile Management Tests")
]
# Run each test module
timings = {}
for module_name, header in test_modules:
try:
time_taken = await run_test_module(module_name, header)
timings[module_name] = time_taken
except Exception as e:
logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
# Print summary
logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
logger.info(f"{'-'*50}", tag="SUMMARY")
for module_name, header in test_modules:
if module_name in timings:
logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
else:
logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
logger.info(f"{'-'*50}", tag="SUMMARY")
total_time = sum(timings.values())
logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,17 @@
from crawl4ai.browser_profiler import BrowserProfiler
import asyncio
if __name__ == "__main__":
# Test launching a standalone browser
async def test_standalone_browser():
profiler = BrowserProfiler()
cdp_url = await profiler.launch_standalone_browser(
browser_type="chromium",
user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
debugging_port=9222,
headless=False
)
print(f"CDP URL: {cdp_url}")
asyncio.run(test_standalone_browser())

View File

@@ -0,0 +1,902 @@
"""
Test examples for parallel crawling with the browser module.
These examples demonstrate the functionality of parallel page creation
and serve as functional tests for multi-page crawling performance.
"""
import asyncio
import os
import sys
import time
from typing import List
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_get_pages_basic():
"""Test basic functionality of get_pages method."""
logger.info("Testing basic get_pages functionality", tag="TEST")
browser_config = BrowserConfig(headless=True)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
# Request 3 pages
crawler_config = CrawlerRunConfig()
pages = await manager.get_pages(crawler_config, count=3)
# Verify we got the correct number of pages
assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
# Verify each page is valid
for i, (page, context) in enumerate(pages):
await page.goto("https://example.com")
title = await page.title()
logger.info(f"Page {i+1} title: {title}", tag="TEST")
assert title, f"Page {i+1} has no title"
await manager.close()
logger.success("Basic get_pages test completed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_parallel_approaches_comparison():
"""Compare two parallel crawling approaches:
1. Create a page for each URL on-demand (get_page + gather)
2. Get all pages upfront with get_pages, then use them (get_pages + gather)
"""
logger.info("Comparing different parallel crawling approaches", tag="TEST")
urls = [
"https://example.com/page1",
"https://crawl4ai.com",
"https://kidocode.com",
"https://bbc.com",
# "https://example.com/page1",
# "https://example.com/page2",
# "https://example.com/page3",
# "https://example.com/page4",
]
browser_config = BrowserConfig(headless=False)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
# Approach 1: Create a page for each URL on-demand and run in parallel
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
start_time = time.time()
async def fetch_title_approach1(url):
"""Create a new page for each URL, go to the URL, and get title"""
crawler_config = CrawlerRunConfig(url=url)
page, context = await manager.get_page(crawler_config)
try:
await page.goto(url)
title = await page.title()
return title
finally:
await page.close()
# Run fetch_title_approach1 for each URL in parallel
tasks = [fetch_title_approach1(url) for url in urls]
approach1_results = await asyncio.gather(*tasks)
approach1_time = time.time() - start_time
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
start_time = time.time()
# Get all pages upfront
crawler_config = CrawlerRunConfig()
pages = await manager.get_pages(crawler_config, count=len(urls))
async def fetch_title_approach2(page_ctx, url):
"""Use a pre-created page to go to URL and get title"""
page, _ = page_ctx
try:
await page.goto(url)
title = await page.title()
return title
finally:
await page.close()
# Use the pre-created pages to fetch titles in parallel
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
approach2_results = await asyncio.gather(*tasks)
approach2_time = time.time() - start_time
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
# Compare results and performance
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
if speedup > 1:
logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
else:
logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
# Verify same content was retrieved in both approaches
assert len(approach1_results) == len(approach2_results), "Result count mismatch"
# Sort results for comparison since parallel execution might complete in different order
assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"
await manager.close()
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
except:
pass
return False
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
"""Test performance with multiple browsers and pages per browser.
Compares two approaches:
1. On-demand page creation (get_page + gather)
2. Pre-created pages (get_pages + gather)
"""
logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")
# Generate test URLs
total_pages = num_browsers * pages_per_browser
urls = [f"https://example.com/page_{i}" for i in range(total_pages)]
# Create browser managers
managers = []
base_port = 9222
try:
# Start all browsers in parallel
start_tasks = []
for i in range(num_browsers):
browser_config = BrowserConfig(
headless=True # Using default browser mode like in test_parallel_approaches_comparison
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
start_tasks.append(manager.start())
managers.append(manager)
await asyncio.gather(*start_tasks)
# Distribute URLs among managers
urls_per_manager = {}
for i, manager in enumerate(managers):
start_idx = i * pages_per_browser
end_idx = min(start_idx + pages_per_browser, len(urls))
urls_per_manager[manager] = urls[start_idx:end_idx]
# Approach 1: Create a page for each URL on-demand and run in parallel
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
start_time = time.time()
async def fetch_title_approach1(manager, url):
"""Create a new page for the URL, go to the URL, and get title"""
crawler_config = CrawlerRunConfig(url=url)
page, context = await manager.get_page(crawler_config)
try:
await page.goto(url)
title = await page.title()
return title
finally:
await page.close()
# Run fetch_title_approach1 for each URL in parallel
tasks = []
for manager, manager_urls in urls_per_manager.items():
for url in manager_urls:
tasks.append(fetch_title_approach1(manager, url))
approach1_results = await asyncio.gather(*tasks)
approach1_time = time.time() - start_time
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
start_time = time.time()
# Get all pages upfront for each manager
all_pages = []
for manager, manager_urls in urls_per_manager.items():
crawler_config = CrawlerRunConfig()
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
all_pages.extend(zip(pages, manager_urls))
async def fetch_title_approach2(page_ctx, url):
"""Use a pre-created page to go to URL and get title"""
page, _ = page_ctx
try:
await page.goto(url)
title = await page.title()
return title
finally:
await page.close()
# Use the pre-created pages to fetch titles in parallel
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
approach2_results = await asyncio.gather(*tasks)
approach2_time = time.time() - start_time
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
# Compare results and performance
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
pages_per_second = total_pages / approach2_time
# Show a simple summary
logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")
if speedup > 1:
logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
else:
logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
# Close all managers
for manager in managers:
await manager.close()
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Clean up
for manager in managers:
try:
await manager.close()
except:
pass
return False
async def grid_search_optimal_configuration(total_urls=50):
"""Perform a grid search to find the optimal balance between number of browsers and pages per browser.
This function tests different combinations of browser count and pages per browser,
while keeping the total number of URLs constant. It measures performance metrics
for each configuration to find the "sweet spot" that provides the best speed
with reasonable memory usage.
Args:
total_urls: Total number of URLs to crawl (default: 50)
"""
logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
# Generate test URLs once
urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
# Define grid search configurations
# We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls)
# and distribute pages evenly (some browsers may have 1 more page than others)
configurations = []
# Maximum number of browsers to test
max_browsers_to_test = min(20, total_urls)
# Try configurations with 1 to max_browsers_to_test browsers
for num_browsers in range(1, max_browsers_to_test + 1):
base_pages_per_browser = total_urls // num_browsers
remainder = total_urls % num_browsers
# Generate exact page distribution array
if remainder > 0:
# First 'remainder' browsers get one more page
page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
else:
# All browsers get the same number of pages
page_distribution = [base_pages_per_browser] * num_browsers
pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
# Format the distribution as a tuple string like (4, 4, 3, 3)
distribution_str = str(tuple(page_distribution))
configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
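# Worked example (illustrative): total_urls=10, num_browsers=4 ->
# base_pages_per_browser=2, remainder=2, page_distribution=[3, 3, 2, 2],
# i.e. "3 pages × 2 browsers, 2 pages × 2 browsers".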
# Track results
results = []
# Test each configuration
for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
logger.info("-" * 80, tag="TEST")
logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
logger.info(f"Details: {pages_distribution}", tag="TEST")
# Sleep a bit for randomness
await asyncio.sleep(0.5)
try:
# Import psutil for memory tracking
try:
import psutil
process = psutil.Process()
initial_memory = process.memory_info().rss / (1024 * 1024) # MB
except ImportError:
logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
initial_memory = 0
# Create and start browser managers
managers = []
start_time = time.time()
# Start all browsers in parallel
start_tasks = []
for i in range(num_browsers):
browser_config = BrowserConfig(
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
start_tasks.append(manager.start())
managers.append(manager)
await asyncio.gather(*start_tasks)
browser_startup_time = time.time() - start_time
# Measure memory after browser startup
if initial_memory > 0:
browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
else:
browser_memory = 0
# Distribute URLs among managers using the exact page distribution
urls_per_manager = {}
total_assigned = 0
for i, manager in enumerate(managers):
if i < len(page_distribution):
# Get the exact number of pages for this browser from our distribution
manager_pages = page_distribution[i]
# Get the URL slice for this manager
start_idx = total_assigned
end_idx = start_idx + manager_pages
urls_per_manager[manager] = urls[start_idx:end_idx]
total_assigned += manager_pages
else:
# If we have more managers than our distribution (should never happen)
urls_per_manager[manager] = []
# Use the more efficient approach (pre-created pages)
logger.info("Running page crawling test...", tag="TEST")
crawl_start_time = time.time()
# Get all pages upfront for each manager
all_pages = []
for manager, manager_urls in urls_per_manager.items():
if not manager_urls: # Skip managers with no URLs
continue
crawler_config = CrawlerRunConfig()
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
all_pages.extend(zip(pages, manager_urls))
# Measure memory after page creation
if initial_memory > 0:
pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
else:
pages_memory = 0
# Function to crawl a URL with a pre-created page
async def fetch_title(page_ctx, url):
page, _ = page_ctx
try:
await page.goto(url)
title = await page.title()
return title
finally:
await page.close()
# Use the pre-created pages to fetch titles in parallel
tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
crawl_results = await asyncio.gather(*tasks)
crawl_time = time.time() - crawl_start_time
total_time = time.time() - start_time
# Final memory measurement
if initial_memory > 0:
peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
else:
peak_memory = 0
# Close all managers
for manager in managers:
await manager.close()
# Calculate metrics
pages_per_second = total_urls / crawl_time
# Store result metrics
result = {
"num_browsers": num_browsers,
"pages_per_browser": pages_per_browser,
"page_distribution": page_distribution,
"distribution_str": distribution_str,
"total_urls": total_urls,
"browser_startup_time": browser_startup_time,
"crawl_time": crawl_time,
"total_time": total_time,
"browser_memory": browser_memory,
"pages_memory": pages_memory,
"peak_memory": peak_memory,
"pages_per_second": pages_per_second,
# Calculate efficiency score (higher is better)
# This balances speed vs memory usage
"efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
}
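# Worked example of the efficiency score (illustrative): 5.0 pages/s at 400 MB peak
# scores 5.0 / 401 ≈ 0.012, while 4.0 pages/s at 100 MB scores 4.0 / 101 ≈ 0.040,
# so the lighter configuration ranks higher despite being slower.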
results.append(result)
# Log the results
logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
if peak_memory > 0:
logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
except Exception as e:
logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
import traceback
traceback.print_exc()
# Clean up
for manager in managers:
try:
await manager.close()
except:
pass
# Print summary of all configurations
logger.info("=" * 100, tag="TEST")
logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
logger.info("=" * 100, tag="TEST")
# Rank configurations by efficiency score
ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
# Also determine rankings by different metrics
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
most_efficient = ranked_results[0]
# Print top performers by category
logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
if lowest_memory["peak_memory"] > 0:
logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
# Print result table header
logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
logger.info("-" * 120, tag="TEST")
# Define table header
header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
logger.info(header, tag="TEST")
logger.info("-" * 120, tag="TEST")
# Print each configuration in ranked order
for rank, result in enumerate(ranked_results, 1):
# Add special notes for top performers
notes = []
if result == fastest:
notes.append("⚡ Fastest")
if result == lowest_memory:
notes.append("💾 Lowest Memory")
if result == most_efficient:
notes.append("🌟 Most Efficient")
notes_str = " | ".join(notes) if notes else ""
# Format memory if available
memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
# Get the distribution string
dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
# Build the row
row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
logger.info(row, tag="TEST")
logger.info("-" * 120, tag="TEST")
# Generate visualization if matplotlib is available
try:
import matplotlib.pyplot as plt
import numpy as np
# Extract data for plotting from ranked results
browser_counts = [r["num_browsers"] for r in ranked_results]
efficiency_scores = [r["efficiency_score"] for r in ranked_results]
crawl_times = [r["crawl_time"] for r in ranked_results]
total_times = [r["total_time"] for r in ranked_results]
# Filter results with memory data
memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
memory_browser_counts = [r["num_browsers"] for r in memory_results]
peak_memories = [r["peak_memory"] for r in memory_results]
# Create figure with clean design
plt.figure(figsize=(14, 12), facecolor='white')
plt.style.use('ggplot')
# Create grid for subplots
gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
# Plot 1: Efficiency Score (higher is better)
ax1 = plt.subplot(gs[0])
bar_colors = ['#3498db'] * len(browser_counts)
# Highlight the most efficient
most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient
bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
ax1.set_xticks(range(len(browser_counts)))
ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
ax1.set_xlabel('Number of Browsers')
ax1.set_ylabel('Efficiency Score (higher is better)')
ax1.set_title('Browser Configuration Efficiency (higher is better)')
# Add value labels on top of bars
for bar, score in zip(bars, efficiency_scores):
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
# Highlight best configuration
ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
transform=ax1.transAxes, fontsize=12, verticalalignment='top',
bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
# Plot 2: Time Performance
ax2 = plt.subplot(gs[1])
# Plot both total time and crawl time
ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
# Mark the fastest configuration
fastest_idx = browser_counts.index(fastest["num_browsers"])
ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
label=f'Fastest: {fastest["num_browsers"]} browsers')
ax2.set_xlabel('Number of Browsers')
ax2.set_ylabel('Time (seconds)')
ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
ax2.grid(True, linestyle='--', alpha=0.7)
ax2.legend(loc='upper right')
# Plot pages per second on second y-axis
pages_per_second = [total_urls/t for t in crawl_times]
ax2_twin = ax2.twinx()
ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
ax2_twin.set_ylabel('Pages per second')
# Add note about the fastest configuration
ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
transform=ax2.transAxes, fontsize=12, verticalalignment='top',
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
# Plot 3: Memory Usage (if available)
if memory_results:
ax3 = plt.subplot(gs[2])
# Prepare data for grouped bar chart
memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
memory_per_page = [m/(n*p) for m, n, p in zip(
[r["peak_memory"] for r in memory_results],
[r["num_browsers"] for r in memory_results],
[r["pages_per_browser"] for r in memory_results])]
x = np.arange(len(memory_browser_counts))
width = 0.35
# Create grouped bars
ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
# Configure axis
ax3.set_xticks(x)
ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
ax3.set_xlabel('Number of Browsers')
ax3.set_ylabel('Memory (MB)')
ax3.set_title('Memory Usage by Browser Configuration')
ax3.legend(loc='upper left')
ax3.grid(True, linestyle='--', alpha=0.7)
# Add second y-axis for memory per page
ax3_twin = ax3.twinx()
ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
ax3_twin.set_ylabel('Memory per Page (MB)')
# Get lowest memory configuration
lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
# Add note about lowest memory configuration
ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
transform=ax3.transAxes, fontsize=12, verticalalignment='top',
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
# Add overall title
plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
# Add timestamp and info at the bottom
plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
ha="center", fontsize=10, style='italic')
# Get current directory and save the figure there
import os
__current_file = os.path.abspath(__file__)
current_dir = os.path.dirname(__current_file)
output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
# Adjust layout and save figure with high DPI
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(output_file, dpi=200, bbox_inches='tight')
logger.success(f"Visualization saved to {output_file}", tag="TEST")
except ImportError:
logger.warning("matplotlib not available, skipping visualization", tag="TEST")
return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
"""Find optimal browser configuration for crawling a specific number of URLs.
Args:
total_urls: Number of URLs to crawl
verbose: Whether to print progress
rate_limit_delay: Delay between page loads to avoid rate limiting
Returns:
dict: Contains fastest, lowest_memory, and optimal configurations
"""
if verbose:
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
# Generate test URLs with timestamp to avoid caching
timestamp = int(time.time())
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
# Limit browser configurations to test (1 browser to max 10)
max_browsers = min(10, total_urls)
configs_to_test = []
# Generate configurations (browser count, pages distribution)
for num_browsers in range(1, max_browsers + 1):
base_pages = total_urls // num_browsers
remainder = total_urls % num_browsers
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
if remainder > 0:
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
else:
distribution = [base_pages] * num_browsers
configs_to_test.append((num_browsers, distribution))
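# Worked example (illustrative): total_urls=20 gives max_browsers=10; for num_browsers=3,
# base_pages=6, remainder=2, so distribution=[7, 7, 6].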
results = []
# Test each configuration
for browser_count, page_distribution in configs_to_test:
if verbose:
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
try:
# Track memory if possible
try:
import psutil
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024) # MB
except ImportError:
if verbose:
print("Memory tracking not available (psutil not installed)")
start_memory = 0
# Start browsers in parallel
managers = []
start_tasks = []
start_time = time.time()
for i in range(browser_count):
config = BrowserConfig(headless=True)
manager = BrowserManager(browser_config=config, logger=logger)
start_tasks.append(manager.start())
managers.append(manager)
await asyncio.gather(*start_tasks)
# Distribute URLs among browsers
urls_per_manager = {}
url_index = 0
for i, manager in enumerate(managers):
pages_for_this_browser = page_distribution[i]
end_index = url_index + pages_for_this_browser
urls_per_manager[manager] = urls[url_index:end_index]
url_index = end_index
# Create pages for each browser
all_pages = []
for manager, manager_urls in urls_per_manager.items():
if not manager_urls:
continue
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
all_pages.extend(zip(pages, manager_urls))
# Crawl pages with delay to avoid rate limiting
async def crawl_page(page_ctx, url):
page, _ = page_ctx
try:
await page.goto(url)
if rate_limit_delay > 0:
await asyncio.sleep(rate_limit_delay)
title = await page.title()
return title
finally:
await page.close()
crawl_start = time.time()
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
await asyncio.gather(*crawl_tasks)
crawl_time = time.time() - crawl_start
total_time = time.time() - start_time
# Measure final memory usage
if start_memory > 0:
end_memory = process.memory_info().rss / (1024 * 1024)
memory_used = end_memory - start_memory
else:
memory_used = 0
# Close all browsers
for manager in managers:
await manager.close()
# Calculate metrics
pages_per_second = total_urls / crawl_time
# Calculate efficiency score (higher is better)
# This balances speed vs memory
if memory_used > 0:
efficiency = pages_per_second / (memory_used + 1)
else:
efficiency = pages_per_second
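# Note: this is the same pages-per-second-per-MB heuristic used in
# grid_search_optimal_configuration; the "+ 1" keeps the denominator at least 1 MB.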
# Store result
result = {
"browser_count": browser_count,
"distribution": tuple(page_distribution),
"crawl_time": crawl_time,
"total_time": total_time,
"memory_used": memory_used,
"pages_per_second": pages_per_second,
"efficiency": efficiency
}
results.append(result)
if verbose:
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
if memory_used > 0:
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
print(f" ✓ Efficiency score: {efficiency:.4f}")
except Exception as e:
if verbose:
print(f" ✗ Error: {str(e)}")
# Clean up
for manager in managers:
try:
await manager.close()
except:
pass
# If no successful results, return None
if not results:
return None
# Find best configurations
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
# Only consider memory if available
memory_results = [r for r in results if r["memory_used"] > 0]
if memory_results:
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
else:
lowest_memory = fastest
# Find most efficient (balanced speed vs memory)
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
# Print summary
if verbose:
print("\n=== OPTIMAL CONFIGURATIONS ===")
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
if lowest_memory["memory_used"] > 0:
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
return {
"fastest": fastest,
"lowest_memory": lowest_memory,
"optimal": optimal,
"all_configs": results
}
async def run_tests():
"""Run all tests sequentially."""
results = []
# Find optimal configuration using our utility function
configs = await find_optimal_browser_config(
total_urls=20, # Use a small number for faster testing
verbose=True,
rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting
)
if configs:
# Show the optimal configuration
optimal = configs["optimal"]
print(f"\n🎯 Recommended configuration for production use:")
print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
results.append(True)
else:
print("\n❌ Failed to find optimal configuration")
results.append(False)
# Print summary
total = len(results)
passed = sum(results)
print(f"\nTests complete: {passed}/{total} passed")
if passed == total:
print("All tests passed!")
else:
print(f"{total - passed} tests failed")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,316 @@
"""Test examples for PlaywrightBrowserStrategy.
These examples demonstrate the functionality of PlaywrightBrowserStrategy
and serve as functional tests.
"""
import asyncio
import os
import re
import sys
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_start_close():
# Create browser config for standard Playwright
browser_config = BrowserConfig(
headless=True,
viewport_width=1280,
viewport_height=800
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
for _ in range(4):
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Get a page
page, context = await manager.get_page(CrawlerRunConfig())
logger.info("Got page successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
await asyncio.sleep(1) # Wait for a moment before restarting
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
return True
async def test_playwright_basic():
"""Test basic Playwright browser functionality."""
logger.info("Testing standard Playwright browser", tag="TEST")
# Create browser config for standard Playwright
browser_config = BrowserConfig(
headless=True,
viewport_width=1280,
viewport_height=800
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create crawler config
crawler_config = CrawlerRunConfig(url="https://example.com")
# Get a page
page, context = await manager.get_page(crawler_config)
logger.info("Got page successfully", tag="TEST")
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_playwright_text_mode():
"""Test Playwright browser in text-only mode."""
logger.info("Testing Playwright text mode", tag="TEST")
# Create browser config with text mode enabled
browser_config = BrowserConfig(
headless=True,
text_mode=True # Enable text-only mode
)
# Create browser manager with the config
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully in text mode", tag="TEST")
# Get a page
crawler_config = CrawlerRunConfig(url="https://example.com")
page, context = await manager.get_page(crawler_config)
# Navigate to a website
await page.goto("https://example.com")
logger.info("Navigated to example.com", tag="TEST")
# Get page title
title = await page.title()
logger.info(f"Page title: {title}", tag="TEST")
# Check if images are blocked in text mode
# We'll check if any image requests were made
has_images = False
async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
try:
# Try to load a page with images
await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
request = await request_info.value
has_images = True
except:
# Timeout without image requests means text mode is working
has_images = False
logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return True
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
except:
pass
return False
async def test_playwright_context_reuse():
"""Test context caching and reuse with identical configurations."""
logger.info("Testing context reuse with identical configurations", tag="TEST")
# Create browser config
browser_config = BrowserConfig(headless=True)
# Create browser manager
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
# Start the browser
await manager.start()
logger.info("Browser started successfully", tag="TEST")
# Create identical crawler configs
crawler_config1 = CrawlerRunConfig(
css_selector="body",
)
crawler_config2 = CrawlerRunConfig(
css_selector="body",
)
# Get pages with these configs
page1, context1 = await manager.get_page(crawler_config1)
page2, context2 = await manager.get_page(crawler_config2)
# Check if contexts are reused
is_same_context = context1 == context2
logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
# Now try with a different config
crawler_config3 = CrawlerRunConfig()
page3, context3 = await manager.get_page(crawler_config3)
# This should be a different context
is_different_context = context1 != context3
logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
# Both tests should pass for success
return is_same_context and is_different_context
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Ensure cleanup
try:
await manager.close()
        except Exception:
pass
return False
async def test_playwright_session_management():
"""Test session management with Playwright browser."""
logger.info("Testing session management with Playwright browser", tag="TEST")
browser_config = BrowserConfig(
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
try:
await manager.start()
logger.info("Browser launched successfully", tag="TEST")
# Create two sessions
session1_id = "playwright_session_1"
session2_id = "playwright_session_2"
# Set up first session
crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
page1, context1 = await manager.get_page(crawler_config1)
await page1.goto("https://example.com")
await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
# Set up second session
crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
page2, context2 = await manager.get_page(crawler_config2)
await page2.goto("https://example.org")
await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
# Get first session again
page1_again, context1_again = await manager.get_page(crawler_config1)
# Verify it's the same page and data persists
is_same_page = page1 == page1_again
is_same_context = context1 == context1_again
data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
# Kill first session
await manager.kill_session(session1_id)
logger.info(f"Killed session 1", tag="TEST")
# Verify second session still works
data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
# Clean up
await manager.close()
logger.info("Browser closed successfully", tag="TEST")
return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
try:
await manager.close()
        except Exception:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
# results.append(await test_start_close())
# results.append(await test_playwright_basic())
# results.append(await test_playwright_text_mode())
# results.append(await test_playwright_context_reuse())
results.append(await test_playwright_session_management())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,176 @@
"""Test examples for BrowserProfileManager.
These examples demonstrate the functionality of BrowserProfileManager
and serve as functional tests.
"""
import asyncio
import os
import sys
import uuid
import shutil
# Add the project root to Python path if running directly
if __name__ == "__main__":
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
from crawl4ai.browser import BrowserManager, BrowserProfileManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
# Create a logger for clear terminal output
logger = AsyncLogger(verbose=True, log_file=None)
async def test_profile_creation():
"""Test creating and managing browser profiles."""
logger.info("Testing profile creation and management", tag="TEST")
profile_manager = BrowserProfileManager(logger=logger)
try:
# List existing profiles
profiles = profile_manager.list_profiles()
logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")
# Generate a unique profile name for testing
test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"
# Create a test profile directory
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
# Create a dummy Preferences file to simulate a Chrome profile
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
f.write("{\"test\": true}")
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
# Verify the profile is now in the list
profiles = profile_manager.list_profiles()
profile_found = any(p["name"] == test_profile_name for p in profiles)
logger.info(f"Profile found in list: {profile_found}", tag="TEST")
# Try to get the profile path
retrieved_path = profile_manager.get_profile_path(test_profile_name)
path_match = retrieved_path == profile_path
logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")
# Delete the profile
success = profile_manager.delete_profile(test_profile_name)
logger.info(f"Profile deletion successful: {success}", tag="TEST")
# Verify it's gone
profiles_after = profile_manager.list_profiles()
profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")
# Clean up just in case
if os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
return profile_found and path_match and success and profile_removed
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Clean up test directory
try:
if os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
        except Exception:
pass
return False
async def test_profile_with_browser():
"""Test using a profile with a browser."""
logger.info("Testing using a profile with a browser", tag="TEST")
profile_manager = BrowserProfileManager(logger=logger)
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
profile_path = None
try:
# Create a test profile directory
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
# Create a dummy Preferences file to simulate a Chrome profile
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
f.write("{\"test\": true}")
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
# Now use this profile with a browser
browser_config = BrowserConfig(
user_data_dir=profile_path,
headless=True
)
manager = BrowserManager(browser_config=browser_config, logger=logger)
# Start the browser with the profile
await manager.start()
logger.info("Browser started with profile", tag="TEST")
# Create a page
crawler_config = CrawlerRunConfig()
page, context = await manager.get_page(crawler_config)
# Navigate and set some data to verify profile works
await page.goto("https://example.com")
await page.evaluate("localStorage.setItem('test_data', 'profile_value')")
# Close browser
await manager.close()
logger.info("First browser session closed", tag="TEST")
# Create a new browser with the same profile
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
await manager2.start()
logger.info("Second browser session started with same profile", tag="TEST")
# Get a page and check if the data persists
page2, context2 = await manager2.get_page(crawler_config)
await page2.goto("https://example.com")
data = await page2.evaluate("localStorage.getItem('test_data')")
# Verify data persisted
data_persisted = data == "profile_value"
logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")
# Clean up
await manager2.close()
logger.info("Second browser session closed", tag="TEST")
# Delete the test profile
success = profile_manager.delete_profile(test_profile_name)
logger.info(f"Test profile deleted: {success}", tag="TEST")
return data_persisted and success
except Exception as e:
logger.error(f"Test failed: {str(e)}", tag="TEST")
# Clean up
try:
if profile_path and os.path.exists(profile_path):
shutil.rmtree(profile_path, ignore_errors=True)
        except Exception:
pass
return False
async def run_tests():
"""Run all tests sequentially."""
results = []
results.append(await test_profile_creation())
results.append(await test_profile_with_browser())
# Print summary
total = len(results)
passed = sum(results)
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
if passed == total:
logger.success("All tests passed!", tag="SUMMARY")
else:
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
asyncio.run(run_tests())

133
tests/cli/test_cli.py Normal file
View File

@@ -0,0 +1,133 @@
import pytest
from click.testing import CliRunner
from pathlib import Path
import json
import yaml
from crawl4ai.cli import cli, load_config_file, parse_key_values
import tempfile
import os
import click
@pytest.fixture
def runner():
return CliRunner()
@pytest.fixture
def temp_config_dir():
with tempfile.TemporaryDirectory() as tmpdir:
old_home = os.environ.get('HOME')
os.environ['HOME'] = tmpdir
yield Path(tmpdir)
        if old_home is not None:
            os.environ['HOME'] = old_home
        else:
            os.environ.pop('HOME', None)
@pytest.fixture
def sample_configs(temp_config_dir):
configs = {
'browser.yml': {
'headless': True,
'viewport_width': 1280,
'user_agent_mode': 'random'
},
'crawler.yml': {
'cache_mode': 'bypass',
'wait_until': 'networkidle',
'scan_full_page': True
},
'extract_css.yml': {
'type': 'json-css',
'params': {'verbose': True}
},
'css_schema.json': {
'name': 'ArticleExtractor',
'baseSelector': '.article',
'fields': [
{'name': 'title', 'selector': 'h1.title', 'type': 'text'},
{'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
]
}
}
for filename, content in configs.items():
path = temp_config_dir / filename
with open(path, 'w') as f:
if filename.endswith('.yml'):
yaml.dump(content, f)
else:
json.dump(content, f)
return {name: str(temp_config_dir / name) for name in configs}
class TestCLIBasics:
def test_help(self, runner):
result = runner.invoke(cli, ['--help'])
assert result.exit_code == 0
assert 'Crawl4AI CLI' in result.output
def test_examples(self, runner):
result = runner.invoke(cli, ['--example'])
assert result.exit_code == 0
assert 'Examples' in result.output
def test_missing_url(self, runner):
result = runner.invoke(cli)
assert result.exit_code != 0
assert 'URL argument is required' in result.output
class TestConfigParsing:
def test_parse_key_values_basic(self):
result = parse_key_values(None, None, "key1=value1,key2=true")
assert result == {'key1': 'value1', 'key2': True}
def test_parse_key_values_invalid(self):
with pytest.raises(click.BadParameter):
parse_key_values(None, None, "invalid_format")
class TestConfigLoading:
def test_load_yaml_config(self, sample_configs):
config = load_config_file(sample_configs['browser.yml'])
assert config['headless'] is True
assert config['viewport_width'] == 1280
def test_load_json_config(self, sample_configs):
config = load_config_file(sample_configs['css_schema.json'])
assert config['name'] == 'ArticleExtractor'
assert len(config['fields']) == 2
def test_load_nonexistent_config(self):
with pytest.raises(click.BadParameter):
load_config_file('nonexistent.yml')
class TestLLMConfig:
def test_llm_config_creation(self, temp_config_dir, runner):
def input_simulation(inputs):
return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
input='\n'.join(inputs))
class TestCrawlingFeatures:
def test_basic_crawl(self, runner):
result = runner.invoke(cli, ['https://example.com'])
assert result.exit_code == 0
class TestErrorHandling:
def test_invalid_config_file(self, runner):
result = runner.invoke(cli, [
'https://example.com',
'--browser-config', 'nonexistent.yml'
])
assert result.exit_code != 0
def test_invalid_schema(self, runner, temp_config_dir):
invalid_schema = temp_config_dir / 'invalid_schema.json'
with open(invalid_schema, 'w') as f:
f.write('invalid json')
result = runner.invoke(cli, [
'https://example.com',
'--schema', str(invalid_schema)
])
assert result.exit_code != 0
if __name__ == '__main__':
pytest.main(['-v', '-s', '--tb=native', __file__])

View File

@@ -0,0 +1,113 @@
import json
from crawl4ai import (
CrawlerRunConfig,
DefaultMarkdownGenerator,
RegexChunking,
JsonCssExtractionStrategy,
BM25ContentFilter,
CacheMode
)
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FastFilterChain
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
def create_test_config() -> CrawlerRunConfig:
# Set up content filtering and markdown generation
content_filter = BM25ContentFilter(
user_query="technology articles",
)
markdown_generator = DefaultMarkdownGenerator(
content_filter=content_filter,
options={"ignore_links": False, "body_width": 0}
)
# Set up extraction strategy
extraction_schema = {
"name": "ArticleExtractor",
"baseSelector": "article.content",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".article-body", "type": "html"}
]
}
extraction_strategy = JsonCssExtractionStrategy(schema=extraction_schema)
# Set up deep crawling
filter_chain = FastFilterChain([
FastContentTypeFilter(["text/html"]),
FastDomainFilter(blocked_domains=["ads.*"])
])
url_scorer = FastKeywordRelevanceScorer(
keywords=["article", "blog"],
weight=1.0
)
deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=3,
filter_chain=filter_chain,
url_scorer=url_scorer
)
# Create the config
config = CrawlerRunConfig(
word_count_threshold=200,
extraction_strategy=extraction_strategy,
chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
markdown_generator=markdown_generator,
css_selector="main.content",
excluded_tags=["nav", "footer"],
keep_attrs=["href", "src"],
cache_mode=CacheMode.BYPASS,
wait_until="networkidle",
page_timeout=30000,
scan_full_page=True,
deep_crawl_strategy=deep_crawl_strategy,
verbose=True,
stream=True
)
return config
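# For reference, a config like the one built above would typically be consumed by
# AsyncWebCrawler (illustrative sketch only, not executed by this serialization test):
#
#     async with AsyncWebCrawler() as crawler:
#         result = await crawler.arun("https://example.com", config=create_test_config())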
def test_config_serialization_cycle():
# Create original config
original_config = create_test_config()
# Dump to serializable dictionary
serialized = original_config.dump()
print(json.dumps(serialized, indent=2))
# Load back into config object
deserialized_config = CrawlerRunConfig.load(serialized)
# Verify core attributes
assert deserialized_config.word_count_threshold == original_config.word_count_threshold
assert deserialized_config.css_selector == original_config.css_selector
assert deserialized_config.excluded_tags == original_config.excluded_tags
assert deserialized_config.keep_attrs == original_config.keep_attrs
assert deserialized_config.cache_mode == original_config.cache_mode
assert deserialized_config.wait_until == original_config.wait_until
assert deserialized_config.page_timeout == original_config.page_timeout
assert deserialized_config.scan_full_page == original_config.scan_full_page
assert deserialized_config.verbose == original_config.verbose
assert deserialized_config.stream == original_config.stream
# Verify complex objects
assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy)
assert isinstance(deserialized_config.chunking_strategy, RegexChunking)
assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator)
assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter)
assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy)
# Verify deep crawl strategy configuration
assert deserialized_config.deep_crawl_strategy.max_depth == 3
assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain)
assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)
print("Serialization cycle test passed successfully!")
if __name__ == "__main__":
test_config_serialization_cycle()

175
tests/docker/test_docker.py Normal file
View File

@@ -0,0 +1,175 @@
import requests
import time
import httpx
import asyncio
from typing import Dict, Any
from crawl4ai import (
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
)
from crawl4ai import LLMConfig
from crawl4ai.docker_client import Crawl4aiDockerClient
class Crawl4AiTester:
def __init__(self, base_url: str = "http://localhost:11235"):
self.base_url = base_url
def submit_and_wait(
self, request_data: Dict[str, Any], timeout: int = 300
) -> Dict[str, Any]:
# Submit crawl job
response = requests.post(f"{self.base_url}/crawl", json=request_data)
task_id = response.json()["task_id"]
print(f"Task ID: {task_id}")
# Poll for result
start_time = time.time()
while True:
if time.time() - start_time > timeout:
raise TimeoutError(
f"Task {task_id} did not complete within {timeout} seconds"
)
result = requests.get(f"{self.base_url}/task/{task_id}")
status = result.json()
if status["status"] == "failed":
print("Task failed:", status.get("error"))
raise Exception(f"Task failed: {status.get('error')}")
if status["status"] == "completed":
return status
time.sleep(2)
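# Illustrative usage of the helper class above (not executed here; the async tests
# below talk to the API directly via httpx instead):
#
#     tester = Crawl4AiTester()
#     status = tester.submit_and_wait(request_data)  # request_data shaped like the payloads in test_direct_api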
async def test_direct_api():
"""Test direct API endpoints without using the client SDK"""
print("\n=== Testing Direct API Calls ===")
# Test 1: Basic crawl with content filtering
browser_config = BrowserConfig(
headless=True,
viewport_width=1200,
viewport_height=800
)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
),
options={"ignore_links": True}
)
)
request_data = {
"urls": ["https://example.com"],
"browser_config": browser_config.dump(),
"crawler_config": crawler_config.dump()
}
# Make direct API call
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
json=request_data,
timeout=300
)
assert response.status_code == 200
result = response.json()
print("Basic crawl result:", result["success"])
# Test 2: Structured extraction with JSON CSS
schema = {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
request_data["crawler_config"] = crawler_config.dump()
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Structured extraction result:", result["success"])
# Test 3: Get schema
# async with httpx.AsyncClient() as client:
# response = await client.get("http://localhost:8000/schema")
# assert response.status_code == 200
# schemas = response.json()
# print("Retrieved schemas for:", list(schemas.keys()))
async def test_with_client():
"""Test using the Crawl4AI Docker client SDK"""
print("\n=== Testing Client SDK ===")
async with Crawl4aiDockerClient(verbose=True) as client:
# Test 1: Basic crawl
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed"
)
)
)
result = await client.crawl(
urls=["https://example.com"],
browser_config=browser_config,
crawler_config=crawler_config
)
print("Client SDK basic crawl:", result.success)
# Test 2: LLM extraction with streaming
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter(
                    llm_config=LLMConfig(provider="openai/gpt-4o"),
instruction="Extract key technical concepts"
)
),
stream=True
)
async for result in await client.crawl(
urls=["https://example.com"],
browser_config=browser_config,
crawler_config=crawler_config
):
print(f"Streaming result for: {result.url}")
# # Test 3: Get schema
# schemas = await client.get_schema()
# print("Retrieved client schemas for:", list(schemas.keys()))
async def main():
"""Run all tests"""
# Test direct API
print("Testing direct API calls...")
await test_direct_api()
# Test client SDK
print("\nTesting client SDK...")
await test_with_client()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,34 @@
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig
)
async def main():
async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
await client.authenticate("test@example.com")
# Non-streaming crawl
results = await client.crawl(
["https://example.com", "https://python.org"],
browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig()
)
print(f"Non-streaming results: {results}")
# Streaming crawl
crawler_config = CrawlerRunConfig(stream=True)
async for result in await client.crawl(
["https://example.com", "https://python.org"],
browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config
):
print(f"Streamed result: {result}")
# Get schema
schema = await client.get_schema()
print(f"Schema: {schema}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,596 @@
# ==== File: test_rest_api_deep_crawl.py ====
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file if present
# --- Test Configuration ---
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")  # If server is running in Docker, use the host's IP
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")  # If server is running in dev debug mode
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
# --- Helper Functions ---
def load_proxies_from_env() -> List[Dict]:
"""Load proxies from PROXIES environment variable"""
proxies = []
proxies_str = os.getenv("PROXIES", "")
if not proxies_str:
print("PROXIES environment variable not set or empty.")
return proxies
try:
proxy_list = proxies_str.split(",")
for proxy in proxy_list:
proxy = proxy.strip()
if not proxy:
continue
parts = proxy.split(":")
if len(parts) == 4:
ip, port, username, password = parts
proxies.append({
"server": f"http://{ip}:{port}", # Assuming http, adjust if needed
"username": username,
"password": password,
"ip": ip # Store original IP if available
})
elif len(parts) == 2: # ip:port only
ip, port = parts
proxies.append({
"server": f"http://{ip}:{port}",
"ip": ip
})
else:
print(f"Skipping invalid proxy string format: {proxy}")
except Exception as e:
print(f"Error loading proxies from environment: {e}")
return proxies
async def check_server_health(client: httpx.AsyncClient):
"""Check if the server is healthy before running tests."""
try:
response = await client.get("/health")
response.raise_for_status()
print(f"\nServer healthy: {response.json()}")
return True
except (httpx.RequestError, httpx.HTTPStatusError) as e:
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
"""Asserts the basic structure of a single crawl result."""
assert isinstance(result, dict)
assert "url" in result
assert "success" in result
assert "html" in result # Basic crawls should return HTML
assert "metadata" in result
assert isinstance(result["metadata"], dict)
assert "depth" in result["metadata"] # Deep crawls add depth
if check_ssl:
assert "ssl_certificate" in result # Check if SSL info is present
assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
"""Processes an NDJSON streaming response."""
results = []
completed = False
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
if data.get("status") == "completed":
completed = True
break # Stop processing after completion marker
elif data.get("url"): # Ensure it looks like a result object
results.append(data)
else:
print(f"Received non-result JSON line: {data}") # Log other status messages if needed
except json.JSONDecodeError:
pytest.fail(f"Failed to decode JSON line: {line}")
assert completed, "Streaming response did not end with a completion marker."
return results
# --- Pytest Fixtures ---
@pytest_asyncio.fixture(scope="function")
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
"""Provides an async HTTP client"""
# Increased timeout for potentially longer deep crawls
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
yield client
# No explicit close needed with 'async with'
# --- Test Class ---
@pytest.mark.asyncio
class TestDeepCrawlEndpoints:
@pytest_asyncio.fixture(autouse=True)
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
"""Fixture to ensure server is healthy before each test in the class."""
await check_server_health(async_client)
# 1. Basic Deep Crawl
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with limited depth and pages."""
max_depth = 1
max_pages = 3 # start_url + 2 more
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS", # Use string value for CacheMode
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
# Minimal filters for basic test
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) > 1 # Should be more than just the start URL
assert len(data["results"]) <= max_pages # Respect max_pages
found_depth_0 = False
found_depth_1 = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
depth = result["metadata"]["depth"]
assert depth <= max_depth
if depth == 0: found_depth_0 = True
if depth == 1: found_depth_1 = True
assert found_depth_0
assert found_depth_1
# 2. Deep Crawl with Filtering
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with content type and domain filters."""
max_depth = 1
max_pages = 5
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
},
{
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
# Example: Exclude specific paths using regex
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*/category-3/*"], # Block category 3
"reverse": True # Block if match
}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
assert "category-3" not in result["url"] # Check if filter worked
assert result["metadata"]["depth"] <= max_depth
# 3. Deep Crawl with Scoring
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with URL scoring."""
max_depth = 1
max_pages = 4
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Keep basic domain filter
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
},
"url_scorer": { # Add scorer
"type": "CompositeScorer",
"params": {
"scorers": [
{ # Favor pages with 'product' in the URL
"type": "KeywordRelevanceScorer",
"params": {"keywords": ["product"], "weight": 1.0}
},
{ # Penalize deep paths slightly
"type": "PathDepthScorer",
"params": {"optimal_depth": 2, "weight": -0.2}
}
]
}
},
# Set a threshold if needed: "score_threshold": 0.1
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
# We expect scoring to prioritize product pages if available within limits
# assert product_urls_found # This might be too strict depending on site structure and limits
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["metadata"]["depth"] <= max_depth
# 4. Deep Crawl with CSS Extraction
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
max_depth = 6 # Go deep enough to reach product pages
max_pages = 20
# Schema to extract product details
product_schema = {
"name": "ProductDetails",
"baseSelector": "div.container", # Base for product page
"fields": [
{"name": "product_title", "selector": "h1", "type": "text"},
{"name": "price", "selector": ".product-price", "type": "text"},
{"name": "description", "selector": ".product-description p", "type": "text"},
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
]}
]
}
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply extraction to ALL crawled pages
"type": "JsonCssExtractionStrategy",
"params": {"schema": {"type": "dict", "value": product_schema}}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Only crawl HTML on our domain
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
# Optional: Add scoring to prioritize product pages for extraction
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
# assert len(data["results"]) <= max_pages
found_extracted_product = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
if "product_" in result["url"]: # Check product pages specifically
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
# Schema returns list even if one base match
assert isinstance(extracted, list)
if extracted:
item = extracted[0]
assert "product_title" in item and item["product_title"]
assert "price" in item and item["price"]
# Specs might be empty list if not found
assert "specs" in item and isinstance(item["specs"], list)
found_extracted_product = True
print(f"Extracted product: {item.get('product_title')}")
except (json.JSONDecodeError, AssertionError, IndexError) as e:
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
# else:
# # Non-product pages might have None or empty list depending on schema match
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
max_depth = 1 # Limit depth to keep LLM calls manageable
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply LLM extraction to crawled pages
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
"llm_config": { # Example override, rely on server default if possible
"type": "LLMConfig",
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
},
"schema": { # Expected JSON output
"type": "dict",
"value": {
"title": "PageContent", "type": "object",
"properties": {
"h1_title": {"type": "string"},
"first_paragraph": {"type": "string"}
}
}
}
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
except httpx.RequestError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
found_llm_extraction = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
assert isinstance(extracted, dict)
assert "h1_title" in extracted # Check keys based on schema
assert "first_paragraph" in extracted
found_llm_extraction = True
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
# 6. Deep Crawl with SSL Certificate Fetching
async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with fetch_ssl_certificate enabled."""
max_depth = 0 # Only fetch for start URL to keep test fast
max_pages = 1
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"fetch_ssl_certificate": True, # <-- Enable SSL fetching
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
assert result["success"] is True
# Check if SSL info was actually retrieved
if result["ssl_certificate"]:
# Assert directly using dictionary keys
assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
assert "issuer" in result["ssl_certificate"]
assert "subject" in result["ssl_certificate"]
            assert "not_before" in result["ssl_certificate"]   # certificate validity start
            assert "not_after" in result["ssl_certificate"]    # certificate validity end
assert "fingerprint" in result["ssl_certificate"] # Check another key
# This print statement using .get() already works correctly with dictionaries
print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
else:
# This part remains the same
print("SSL Certificate was null in the result.")
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl using proxy rotation."""
proxies = load_proxies_from_env()
if not proxies:
pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
max_depth = 1
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
# Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"proxy_rotation_strategy": { # <-- Define the strategy
"type": "RoundRobinProxyStrategy",
"params": {
# Convert ProxyConfig dicts back to the serialized format expected by server
"proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
# Proxies often cause connection errors, catch them
pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
except httpx.RequestError as e:
pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Primary assertion is that the crawl succeeded *with* proxy config
print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
# Verifying specific proxy usage requires server logs or custom headers/responses
# --- Main Execution Block (for running script directly) ---
if __name__ == "__main__":
pytest_args = ["-v", "-s", __file__]
# Example: Run only proxy test
# pytest_args.append("-k test_deep_crawl_with_proxies")
print(f"Running pytest with args: {pytest_args}")
exit_code = pytest.main(pytest_args)
print(f"Pytest finished with exit code: {exit_code}")

View File

@@ -0,0 +1,255 @@
import inspect
from typing import Any, Dict
from enum import Enum
from crawl4ai import LLMConfig
def to_serializable_dict(obj: Any) -> Dict:
"""
Recursively convert an object to a serializable dictionary using {type, params} structure
for complex objects.
"""
if obj is None:
return None
# Handle basic types
if isinstance(obj, (str, int, float, bool)):
return obj
# Handle Enum
if isinstance(obj, Enum):
return {
"type": obj.__class__.__name__,
"params": obj.value
}
# Handle datetime objects
if hasattr(obj, 'isoformat'):
return obj.isoformat()
# Handle lists, tuples, and sets
if isinstance(obj, (list, tuple, set)):
return [to_serializable_dict(item) for item in obj]
# Handle dictionaries - preserve them as-is
if isinstance(obj, dict):
return {
"type": "dict", # Mark as plain dictionary
"value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
}
# Handle class instances
if hasattr(obj, '__class__'):
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
# Get current values
current_values = {}
for name, param in params.items():
if name == 'self':
continue
value = getattr(obj, name, param.default)
# Only include if different from default, considering empty values
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
"params": current_values
}
return str(obj)
def from_serializable_dict(data: Any) -> Any:
"""
Recursively convert a serializable dictionary back to an object instance.
"""
if data is None:
return None
# Handle basic types
if isinstance(data, (str, int, float, bool)):
return data
# Handle typed data
if isinstance(data, dict) and "type" in data:
# Handle plain dictionaries
if data["type"] == "dict":
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances
import crawl4ai
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
return cls(data["params"])
# Handle class instances
constructor_args = {
k: from_serializable_dict(v) for k, v in data["params"].items()
}
return cls(**constructor_args)
# Handle lists
if isinstance(data, list):
return [from_serializable_dict(item) for item in data]
# Handle raw dictionaries (legacy support)
if isinstance(data, dict):
return {k: from_serializable_dict(v) for k, v in data.items()}
return data
def is_empty_value(value: Any) -> bool:
"""Check if a value is effectively empty/null."""
if value is None:
return True
if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
return True
return False
# if __name__ == "__main__":
# from crawl4ai import (
# CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
# PruningContentFilter, BM25ContentFilter, LLMContentFilter,
# JsonCssExtractionStrategy, CosineStrategy, RegexChunking,
# WebScrapingStrategy, LXMLWebScrapingStrategy
# )
# # Test Case 1: BM25 content filtering through markdown generator
# config1 = CrawlerRunConfig(
# cache_mode=CacheMode.BYPASS,
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(
# user_query="technology articles",
# bm25_threshold=1.2,
# language="english"
# )
# ),
# chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
# excluded_tags=["nav", "footer", "aside"],
# remove_overlay_elements=True
# )
# # Serialize
# serialized = to_serializable_dict(config1)
# print("\nSerialized Config:")
# print(serialized)
# # Example output structure would now look like:
# """
# {
# "type": "CrawlerRunConfig",
# "params": {
# "cache_mode": {
# "type": "CacheMode",
# "params": "bypass"
# },
# "markdown_generator": {
# "type": "DefaultMarkdownGenerator",
# "params": {
# "content_filter": {
# "type": "BM25ContentFilter",
# "params": {
# "user_query": "technology articles",
# "bm25_threshold": 1.2,
# "language": "english"
# }
# }
# }
# }
# }
# }
# """
# # Deserialize
# deserialized = from_serializable_dict(serialized)
# print("\nDeserialized Config:")
# print(to_serializable_dict(deserialized))
# # Verify they match
# assert to_serializable_dict(config1) == to_serializable_dict(deserialized)
# print("\nVerification passed: Configuration matches after serialization/deserialization!")
if __name__ == "__main__":
from crawl4ai import (
CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
PruningContentFilter, BM25ContentFilter, LLMContentFilter,
JsonCssExtractionStrategy, RegexChunking,
WebScrapingStrategy, LXMLWebScrapingStrategy
)
# Test Case 1: BM25 content filtering through markdown generator
config1 = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=BM25ContentFilter(
user_query="technology articles",
bm25_threshold=1.2,
language="english"
)
),
chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
excluded_tags=["nav", "footer", "aside"],
remove_overlay_elements=True
)
# Test Case 2: LLM-based extraction with pruning filter
schema = {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
config2 = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(schema=schema),
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
),
options={"ignore_links": True}
),
scraping_strategy=LXMLWebScrapingStrategy()
)
# Test Case 3:LLM content filter
config3 = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=LLMContentFilter(
llm_config = LLMConfig(provider="openai/gpt-4"),
instruction="Extract key technical concepts",
chunk_token_threshold=2000,
overlap_rate=0.1
),
options={"ignore_images": True}
),
scraping_strategy=WebScrapingStrategy()
)
# Test all configurations
test_configs = [config1, config2, config3]
for i, config in enumerate(test_configs, 1):
print(f"\nTesting Configuration {i}:")
# Serialize
serialized = to_serializable_dict(config)
print(f"\nSerialized Config {i}:")
print(serialized)
# Deserialize
deserialized = from_serializable_dict(serialized)
print(f"\nDeserialized Config {i}:")
print(to_serializable_dict(deserialized)) # Convert back to dict for comparison
# Verify they match
assert to_serializable_dict(config) == to_serializable_dict(deserialized)
print(f"\nVerification passed: Configuration {i} matches after serialization/deserialization!")

146
tests/docker/test_server.py Normal file
View File

@@ -0,0 +1,146 @@
import asyncio
import json
from typing import Optional
from urllib.parse import quote
async def test_endpoint(
endpoint: str,
url: str,
params: Optional[dict] = None,
expected_status: int = 200
) -> None:
"""Test an endpoint and print results"""
import aiohttp
params = params or {}
param_str = "&".join(f"{k}={v}" for k, v in params.items())
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
if param_str:
full_url += f"?{param_str}"
print(f"\nTesting: {full_url}")
try:
async with aiohttp.ClientSession() as session:
async with session.get(full_url) as response:
status = response.status
try:
data = await response.json()
                except Exception:
data = await response.text()
print(f"Status: {status} (Expected: {expected_status})")
if isinstance(data, dict):
print(f"Response: {json.dumps(data, indent=2)}")
else:
print(f"Response: {data[:500]}...") # First 500 chars
assert status == expected_status
return data
except Exception as e:
print(f"Error: {str(e)}")
return None
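# For reference, a call such as test_endpoint("md", "example.com", {"f": "fit", "c": "1"})
# requests roughly http://localhost:8000/md/example.com?f=fit&c=1 (URL built above).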
async def test_llm_task_completion(task_id: str) -> None:
"""Poll task until completion"""
for _ in range(10): # Try 10 times
result = await test_endpoint("llm", task_id)
if result and result.get("status") in ["completed", "failed"]:
return result
print("Task still processing, waiting 5 seconds...")
await asyncio.sleep(5)
print("Task timed out")
async def run_tests():
print("Starting API Tests...")
# Test URLs
urls = [
"example.com",
"https://www.python.org",
"https://news.ycombinator.com/news",
"https://github.com/trending"
]
print("\n=== Testing Markdown Endpoint ===")
    for url in []:  # urls  (loop disabled for now)
# Test different filter types
for filter_type in ["raw", "fit", "bm25", "llm"]:
params = {"f": filter_type}
if filter_type in ["bm25", "llm"]:
params["q"] = "extract main content"
# Test with and without cache
for cache in ["0", "1"]:
params["c"] = cache
await test_endpoint("md", url, params)
await asyncio.sleep(1) # Be nice to the server
print("\n=== Testing LLM Endpoint ===")
    for url in []:  # urls  (loop disabled for now)
# Test basic extraction
result = await test_endpoint(
"llm",
url,
{"q": "Extract title and main content"}
)
if result and "task_id" in result:
print("\nChecking task completion...")
await test_llm_task_completion(result["task_id"])
# Test with schema
schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"content": {"type": "string"},
"links": {"type": "array", "items": {"type": "string"}}
}
}
result = await test_endpoint(
"llm",
url,
{
"q": "Extract content with links",
"s": json.dumps(schema),
"c": "1" # Test with cache
}
)
if result and "task_id" in result:
print("\nChecking schema task completion...")
await test_llm_task_completion(result["task_id"])
await asyncio.sleep(2) # Be nice to the server
print("\n=== Testing Error Cases ===")
# Test invalid URL
await test_endpoint(
"md",
"not_a_real_url",
expected_status=500
)
# Test invalid filter type
await test_endpoint(
"md",
"example.com",
{"f": "invalid"},
expected_status=422
)
# Test LLM without query
await test_endpoint(
"llm",
"example.com"
)
# Test invalid task ID
await test_endpoint(
"llm",
"llm_invalid_task",
expected_status=404
)
print("\nAll tests completed!")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,655 @@
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv()
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON.
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
BFSDeepCrawlStrategy,
FilterChain,
ContentTypeFilter,
DomainFilter,
CompositeScorer,
KeywordRelevanceScorer,
PathDepthScorer,
JsonCssExtractionStrategy,
LLMExtractionStrategy,
LLMConfig
)
# --- Test Configuration ---
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
# Use a known simple HTML page for basic tests
SIMPLE_HTML_URL = "https://httpbin.org/html"
# Use a site suitable for scraping tests
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
# Use a site with internal links for deep crawl tests
DEEP_CRAWL_URL = "https://python.org"
# --- Pytest Fixtures ---
# Use the built-in event_loop fixture from pytest_asyncio
# The custom implementation was causing issues with closing the loop
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
"""Provides an async HTTP client"""
client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
yield client
await client.aclose()
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
"""Check if the server is healthy before running tests."""
try:
response = await client.get("/health")
response.raise_for_status()
print(f"\nServer healthy: {response.json()}")
return True
except (httpx.RequestError, httpx.HTTPStatusError) as e:
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any]):
"""Asserts the basic structure of a single crawl result."""
assert isinstance(result, dict)
assert "url" in result
assert "success" in result
assert "html" in result
# Add more common checks if needed
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
"""Processes an NDJSON streaming response."""
results = []
completed = False
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
if data.get("status") == "completed":
completed = True
break # Stop processing after completion marker
else:
results.append(data)
except json.JSONDecodeError:
pytest.fail(f"Failed to decode JSON line: {line}")
assert completed, "Streaming response did not end with a completion marker."
return results
# --- Test Class ---
@pytest.mark.asyncio
class TestCrawlEndpoints:
@pytest_asyncio.fixture(autouse=True)
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
"""Fixture to ensure server is healthy before each test in the class."""
await check_server_health(async_client)
# 1. Simple Requests (Primitives)
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
"""Test /crawl with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False, # Explicitly false for /crawl
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value # Use enum value
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error: {e}")
print(f"Response content: {e.response.text}")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Must be true for /crawl/stream
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# 2. Multi-URL and Dispatcher
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
            except Exception:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == len(urls)
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with multiple URLs."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == len(urls)
processed_urls = set()
for result in results:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
processed_urls.add(result["url"])
assert processed_urls == set(urls) # Ensure all URLs were processed
# 3. Class Values and Nested Classes (Markdown Generator)
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
"""Test /crawl with MarkdownGenerator using PruningContentFilter."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.ENABLED.value, # Test different cache mode
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.5, # Example param
"threshold_type": "relative"
}
}
}
}
}
}
}
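# For reference, the nested markdown_generator payload above roughly mirrors this
# in-process configuration (a sketch only; constructor signatures may vary by version):
#   DefaultMarkdownGenerator(content_filter=PruningContentFilter(threshold=0.5, threshold_type="relative"))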
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "markdown" in result
assert isinstance(result["markdown"], dict)
assert "raw_markdown" in result["markdown"]
assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
assert "Moby-Dick" in result["markdown"]["raw_markdown"]
# Fit markdown content might be different/shorter due to pruning
assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
"""Test /crawl with MarkdownGenerator using BM25ContentFilter."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "BM25ContentFilter",
"params": {
"user_query": "Herman Melville", # Query for BM25
"bm25_threshold": 0.1, # Lower threshold to increase matches
"language": "english" # Valid parameters
}
}
}
}
}
}
}
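# Roughly equivalent in-process form (a sketch; parameter names are taken from the payload above):
#   DefaultMarkdownGenerator(content_filter=BM25ContentFilter(user_query="Herman Melville", bm25_threshold=0.1, language="english"))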
try:
print(f"Payload for BM25 test: {json.dumps(payload)}")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "markdown" in result
assert isinstance(result["markdown"], dict)
assert "raw_markdown" in result["markdown"]
assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
# Print values for debug
print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
# Either fit_markdown has content (possibly including our query terms)
# or it might be empty if no good BM25 matches were found
# Don't assert specific content since it can be environment-dependent
# 4. Deep Crawling
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with a deep crawl strategy."""
payload = {
"urls": [DEEP_CRAWL_URL], # Start URL
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": CacheMode.BYPASS.value,
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 1, # Limit depth for testing speed
"max_pages": 5, # Limit pages to crawl
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
{
"type": "DomainFilter",
"params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
}
]
}
},
"url_scorer": {
"type": "CompositeScorer",
"params": {
"scorers": [
{
"type": "KeywordRelevanceScorer",
"params": {"keywords": ["documentation", "tutorial"]}
},
{
"type": "PathDepthScorer",
"params": {"weight": 0.5, "optimal_depth": 2}
}
]
}
}
}
}
}
}
}
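# For reference, the deep_crawl_strategy payload above is roughly the JSON form of this
# in-process configuration (a sketch; exact constructor arguments may differ by version):
#   BFSDeepCrawlStrategy(
#       max_depth=1, max_pages=5,
#       filter_chain=FilterChain([
#           ContentTypeFilter(allowed_types=["text/html"]),
#           DomainFilter(allowed_domains=["python.org", "docs.python.org"]),
#       ]),
#       url_scorer=CompositeScorer([
#           KeywordRelevanceScorer(keywords=["documentation", "tutorial"]),
#           PathDepthScorer(weight=0.5, optimal_depth=2),
#       ]),
#   )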
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
# Expect more than 1 result due to deep crawl (start URL + crawled links)
assert len(data["results"]) > 1
assert len(data["results"]) <= 6 # Start URL + max_links=5
start_url_found = False
crawled_urls_found = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
# Print URL for debugging
print(f"Crawled URL: {result['url']}")
# Allow URLs that contain python.org (including subdomains like docs.python.org)
assert "python.org" in result["url"]
if result["url"] == DEEP_CRAWL_URL:
start_url_found = True
else:
crawled_urls_found = True
assert start_url_found
assert crawled_urls_found
# 5. Extraction without LLM (JSON/CSS)
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
"""Test /crawl with JsonCssExtractionStrategy."""
payload = {
"urls": [SCRAPE_TARGET_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
"value": {
"name": "BookList",
"baseSelector": "ol.row li.col-xs-6", # Select each book item
"fields": [
{"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
{"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
]
}
}
}
}
}
}
}
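# Illustrative shape of one extracted item (hypothetical values; the assertions below only
# check that the keys exist and that "rating" contains "star-rating"):
#   {"title": "Some Book Title", "price": "£51.77", "rating": "star-rating Three"}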
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
# Extracted content should be a JSON string representing a list of dicts
try:
extracted_data = json.loads(result["extracted_content"])
assert isinstance(extracted_data, list)
assert len(extracted_data) > 0 # Should find some books
# Check structure of the first extracted item
first_item = extracted_data[0]
assert "title" in first_item
assert "price" in first_item
assert "rating" in first_item
assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
except (json.JSONDecodeError, AssertionError) as e:
pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
# 6. Extraction with LLM
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
"""
Test /crawl with LLMExtractionStrategy.
NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
configured via .llm.env or environment variables.
This test uses the default provider configured in the server's config.yml.
"""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": CacheMode.BYPASS.value,
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract the main title and the author mentioned in the text into JSON.",
# LLMConfig is implicitly defined by server's config.yml and .llm.env
# If you needed to override provider/token PER REQUEST:
"llm_config": {
"type": "LLMConfig",
"params": {
"provider": "openai/gpt-4o", # Example override
"api_token": os.getenv("OPENAI_API_KEY") # Example override
}
},
"schema": { # Optional: Provide a schema for structured output
"type": "dict", # IMPORTANT: Wrap schema dict
"value": {
"title": "Book Info",
"type": "object",
"properties": {
"title": {"type": "string", "description": "The main title of the work"},
"author": {"type": "string", "description": "The author of the work"}
},
"required": ["title", "author"]
}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
data = response.json()
except httpx.HTTPStatusError as e:
# Catch potential server errors (like 500 due to missing/invalid API keys)
pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
except httpx.RequestError as e:
pytest.fail(f"LLM extraction request failed: {e}.")
assert data["success"] is True
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
# Extracted content should be JSON (because we provided a schema)
try:
extracted_data = json.loads(result["extracted_content"])
print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
# Handle both dict and list formats (server returns a list)
if isinstance(extracted_data, list):
assert len(extracted_data) > 0
extracted_item = extracted_data[0] # Take first item
assert isinstance(extracted_item, dict)
assert "title" in extracted_item
assert "author" in extracted_item
assert "Moby-Dick" in extracted_item.get("title", "")
assert "Herman Melville" in extracted_item.get("author", "")
else:
assert isinstance(extracted_data, dict)
assert "title" in extracted_data
assert "author" in extracted_data
assert "Moby-Dick" in extracted_data.get("title", "")
assert "Herman Melville" in extracted_data.get("author", "")
except (json.JSONDecodeError, AssertionError) as e:
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
except Exception as e: # Catch any other unexpected error
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
if __name__ == "__main__":
# Define arguments for pytest programmatically
# -v: verbose output
# -s: show print statements immediately (useful for debugging)
# __file__: tells pytest to run tests in the current file
pytest_args = ["-v", "-s", __file__]
# You can add more pytest arguments here if needed, for example:
# '-k test_llm_extraction': Run only the LLM test function
# pytest_args.append("-k test_llm_extraction")
print(f"Running pytest with args: {pytest_args}")
# Execute pytest
exit_code = pytest.main(pytest_args)
print(f"Pytest finished with exit code: {exit_code}")

View File

@@ -0,0 +1,212 @@
import asyncio
import json
from typing import Optional
from urllib.parse import quote
async def get_token(session, email: str = "test@example.com") -> str:
"""Fetch a JWT token from the /token endpoint."""
url = "http://localhost:8000/token"
payload = {"email": email}
print(f"\nFetching token from {url} with email: {email}")
try:
async with session.post(url, json=payload) as response:
status = response.status
data = await response.json()
print(f"Token Response Status: {status}")
print(f"Token Response: {json.dumps(data, indent=2)}")
if status == 200:
return data["access_token"]
else:
raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
except Exception as e:
print(f"Error fetching token: {str(e)}")
raise
async def test_endpoint(
session,
endpoint: str,
url: str,
token: str,
params: Optional[dict] = None,
expected_status: int = 200
) -> Optional[dict]:
"""Test an endpoint with token and print results."""
params = params or {}
param_str = "&".join(f"{k}={v}" for k, v in params.items())
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
if param_str:
full_url += f"?{param_str}"
headers = {"Authorization": f"Bearer {token}"}
print(f"\nTesting: {full_url}")
try:
async with session.get(full_url, headers=headers) as response:
status = response.status
try:
data = await response.json()
except:
data = await response.text()
print(f"Status: {status} (Expected: {expected_status})")
if isinstance(data, dict):
print(f"Response: {json.dumps(data, indent=2)}")
else:
print(f"Response: {data[:500]}...") # First 500 chars
assert status == expected_status, f"Expected {expected_status}, got {status}"
return data
except Exception as e:
print(f"Error: {str(e)}")
return None
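# Example usage (a sketch; endpoint names and params mirror the calls in run_tests below):
#   data = await test_endpoint(session, "md", "example.com", token, {"f": "fit", "c": "0"})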
async def test_stream_crawl(session, token: str):
"""Test the /crawl/stream endpoint with multiple URLs."""
url = "http://localhost:8000/crawl/stream"
payload = {
"urls": [
"https://example.com",
"https://example.com/page1", # Replicated example.com with variation
"https://example.com/page2", # Replicated example.com with variation
"https://example.com/page3", # Replicated example.com with variation
# "https://www.python.org",
# "https://news.ycombinator.com/news"
],
"browser_config": {"headless": True, "viewport": {"width": 1200}},
"crawler_config": {"stream": True, "cache_mode": "bypass"}
}
headers = {"Authorization": f"Bearer {token}"}
print(f"\nTesting Streaming Crawl: {url}")
print(f"Payload: {json.dumps(payload, indent=2)}")
try:
async with session.post(url, json=payload, headers=headers) as response:
status = response.status
print(f"Status: {status} (Expected: 200)")
assert status == 200, f"Expected 200, got {status}"
# Read streaming response line-by-line (NDJSON)
async for line in response.content:
if line:
data = json.loads(line.decode('utf-8').strip())
print(f"Streamed Result: {json.dumps(data, indent=2)}")
except Exception as e:
print(f"Error in streaming crawl test: {str(e)}")
async def run_tests():
import aiohttp
print("Starting API Tests...")
# Test URLs
urls = [
"example.com",
"https://www.python.org",
"https://news.ycombinator.com/news",
"https://github.com/trending"
]
async with aiohttp.ClientSession() as session:
# Fetch token once and reuse it
token = await get_token(session)
if not token:
print("Aborting tests due to token failure!")
return
print("\n=== Testing Crawl Endpoint ===")
crawl_payload = {
"urls": ["https://example.com"],
"browser_config": {"headless": True},
"crawler_config": {"stream": False}
}
async with session.post(
"http://localhost:8000/crawl",
json=crawl_payload,
headers={"Authorization": f"Bearer {token}"}
) as response:
status = response.status
data = await response.json()
print(f"\nCrawl Endpoint Status: {status}")
print(f"Crawl Response: {json.dumps(data, indent=2)}")
print("\n=== Testing Crawl Stream Endpoint ===")
await test_stream_crawl(session, token)
print("\n=== Testing Markdown Endpoint ===")
for url in []:  # NOTE: markdown endpoint tests are currently disabled; replace [] with urls to enable
for filter_type in ["raw", "fit", "bm25", "llm"]:
params = {"f": filter_type}
if filter_type in ["bm25", "llm"]:
params["q"] = "extract main content"
for cache in ["0", "1"]:
params["c"] = cache
await test_endpoint(session, "md", url, token, params)
await asyncio.sleep(1) # Be nice to the server
print("\n=== Testing LLM Endpoint ===")
for url in urls:
# Test basic extraction (direct response now)
result = await test_endpoint(
session,
"llm",
url,
token,
{"q": "Extract title and main content"}
)
# Test with schema (direct response)
schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"content": {"type": "string"},
"links": {"type": "array", "items": {"type": "string"}}
}
}
result = await test_endpoint(
session,
"llm",
url,
token,
{
"q": "Extract content with links",
"s": json.dumps(schema),
"c": "1" # Test with cache
}
)
await asyncio.sleep(2) # Be nice to the server
print("\n=== Testing Error Cases ===")
# Test invalid URL
await test_endpoint(
session,
"md",
"not_a_real_url",
token,
expected_status=500
)
# Test invalid filter type
await test_endpoint(
session,
"md",
"example.com",
token,
{"f": "invalid"},
expected_status=422
)
# Test LLM without query (should fail according to the server's validation logic)
await test_endpoint(
session,
"llm",
"example.com",
token,
expected_status=400
)
print("\nAll tests completed!")
if __name__ == "__main__":
asyncio.run(run_tests())

View File

@@ -0,0 +1,335 @@
# ==== File: build_dummy_site.py ====
import os
import random
import argparse
from pathlib import Path
from urllib.parse import quote
# --- Configuration ---
NUM_CATEGORIES = 3
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
MAX_DEPTH_TARGET = 5 # Explicitly set target depth
# --- Helper Functions ---
def generate_lorem(words=20):
"""Generates simple placeholder text."""
lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
"adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
"incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
"""Creates an HTML file with basic structure and inline CSS."""
os.makedirs(filepath.parent, exist_ok=True)
# Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
breadcrumb_html = ""
if breadcrumbs:
links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"
# Basic CSS for structure identification (kept the same)
css = """
<style>
body {
font-family: sans-serif;
padding: 20px;
background-color: #1e1e1e;
color: #d1d1d1;
}
.container {
max-width: 960px;
margin: auto;
background: #2c2c2c;
padding: 20px;
border-radius: 5px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
}
h1, h2 {
color: #ccc;
}
a {
color: #9bcdff;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
ul {
list-style: none;
padding-left: 0;
}
li {
margin-bottom: 10px;
}
.category-link,
.subcategory-link,
.product-link,
.details-link,
.reviews-link {
display: block;
padding: 8px;
background-color: #3a3a3a;
border-radius: 3px;
}
.product-preview {
border: 1px solid #444;
padding: 10px;
margin-bottom: 10px;
border-radius: 4px;
background-color: #2a2a2a;
}
.product-title {
color: #d1d1d1;
}
.product-price {
font-weight: bold;
color: #85e085;
}
.product-description,
.product-specs,
.product-reviews {
margin-top: 15px;
line-height: 1.6;
}
.product-specs li {
margin-bottom: 5px;
font-size: 0.9em;
}
.spec-name {
font-weight: bold;
}
.breadcrumbs {
margin-bottom: 20px;
font-size: 0.9em;
color: #888;
}
.breadcrumbs a {
color: #9bcdff;
}
</style>
"""
html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title} - FakeShop</title>
{head_extras}
{css}
</head>
<body>
<div class="container">
{breadcrumb_html}
<h1>{title}</h1>
{body_content}
</div>
</body>
</html>"""
with open(filepath, "w", encoding="utf-8") as f:
f.write(html_content)
# Keep print statement concise for clarity
# print(f"Created: {filepath}")
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
"""Generates the dummy website structure."""
base_dir.mkdir(parents=True, exist_ok=True)
# --- Clean and prepare the base path for URL construction ---
# Ensure it starts with '/' if not empty, and remove any trailing '/'
if base_path:
full_base_path = "/" + base_path.strip('/')
else:
full_base_path = "" # Represents the root
print(f"Using base path for links: '{full_base_path}'")
# --- Level 0: Homepage ---
home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
# Define the *actual* link path for the homepage breadcrumb
home_link_path = f"{full_base_path}/index.html"
breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
# Links *within* the page content should remain relative
for i in range(NUM_CATEGORIES):
cat_name = f"Category-{i+1}"
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
# This path is relative to the current directory (index.html)
cat_relative_page_path = f"{cat_folder_name}/index.html"
home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
home_body += "</ul>"
create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
# --- Levels 1-5 ---
for i in range(NUM_CATEGORIES):
cat_name = f"Category-{i+1}"
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
cat_dir = base_dir / cat_folder_name
# This is the *absolute* path for the breadcrumb link
cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
# Update breadcrumbs list for this level
breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
# --- Level 1: Category Page ---
cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
for j in range(NUM_SUBCATEGORIES_PER_CAT):
subcat_name = f"{cat_name}-Sub-{j+1}"
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
# Path relative to the category page
subcat_relative_page_path = f"{subcat_folder_name}/index.html"
cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
cat_body += "</ul>"
# Pass the updated breadcrumbs list
create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
for j in range(NUM_SUBCATEGORIES_PER_CAT):
subcat_name = f"{cat_name}-Sub-{j+1}"
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
subcat_dir = cat_dir / subcat_folder_name
# Absolute path for the breadcrumb link
subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
# Update breadcrumbs list for this level
breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
# --- Level 2: Sub-Category Page (Product List) ---
subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
for k in range(NUM_PRODUCTS_PER_SUBCAT):
prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
# Filename relative to the subcategory page
prod_filename = f"product_{prod_id}.html"
# Absolute path for the breadcrumb link
prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
# Preview on list page (link remains relative)
subcat_body += f"""
<li>
<div class="product-preview">
<a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
<p>{generate_lorem(10)}</p>
<span class="product-price"{random.uniform(10, 500):.2f}</span>
</div>
</li>"""
# --- Level 3: Product Page ---
prod_price = random.uniform(10, 500)
prod_desc = generate_lorem(40)
prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
prod_reviews_count = random.randint(0, 150)
# Relative filenames for links on this page
details_filename_relative = f"product_{prod_id}_details.html"
reviews_filename_relative = f"product_{prod_id}_reviews.html"
prod_body = f"""
<p class="product-price">Price: £{prod_price:.2f}</p>
<div class="product-description">
<h2>Description</h2>
<p>{prod_desc}</p>
</div>
<div class="product-specs">
<h2>Specifications</h2>
<ul>
{''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
</ul>
</div>
<div class="product-reviews">
<h2>Reviews</h2>
<p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
</div>
<hr>
<p>
<a class="details-link" href="{details_filename_relative}">View More Details</a> |
<a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
</p>
"""
# Update breadcrumbs list for this level
breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
# --- Level 4: Product Details Page ---
details_filename = f"product_{prod_id}_details.html" # Actual filename
# Absolute path for the breadcrumb link
details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
# Update breadcrumbs list for this level
breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
# --- Level 5: Product Reviews Page ---
reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
# Absolute path for the breadcrumb link
reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
for r in range(prod_reviews_count):
reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
reviews_body += "</ul>"
# Update breadcrumbs list for this level
breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
# Pass the updated breadcrumbs list
create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
subcat_body += "</ul>" # Close product-list ul
# Pass the correct breadcrumbs list for the subcategory index page
create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
# --- Main Execution ---
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
parser.add_argument(
"-o", "--output-dir",
type=str,
default="dummy_retail_site",
help="Directory to generate the website in."
)
parser.add_argument(
"-n", "--site-name",
type=str,
default="FakeShop",
help="Name of the fake shop."
)
parser.add_argument(
"-b", "--base-path",
type=str,
default="",
help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
)
# Optional: Add more args to configure counts if needed
args = parser.parse_args()
output_directory = Path(args.output_dir)
site_name = args.site_name
base_path = args.base_path
print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
# Pass the base_path to the generation function
generate_site(output_directory, site_name, base_path)
print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
print("Dummy site generation complete.")
print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
if base_path:
print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
else:
print(f"Access the site at: http://localhost:8000/index.html")

View File

@@ -0,0 +1,56 @@
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
HTTPCrawlerConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter
)
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
async def main():
# Initialize HTTP crawler strategy
http_strategy = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(
method="GET",
verify_ssl=True,
follow_redirects=True
),
logger=AsyncLogger(verbose=True)
)
# Initialize web crawler with HTTP strategy
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
)
)
)
# Test different URLs
urls = [
"https://example.com",
"https://httpbin.org/get",
"raw://<html><body>Test content</body></html>"
]
for url in urls:
print(f"\n=== Testing {url} ===")
try:
result = await crawler.arun(url=url, config=crawler_config)
print(f"Status: {result.status_code}")
print(f"Raw HTML length: {len(result.html)}")
if hasattr(result, 'markdown'):
print(f"Markdown length: {len(result.markdown.raw_markdown)}")
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,46 @@
import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
"""Example deep crawl of documentation site."""
filter_chain = FilterChain([
URLPatternFilter(patterns=["*2025*"]),
DomainFilter(allowed_domains=["techcrunch.com"]),
ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
ContentTypeFilter(allowed_types=["text/html","application/javascript"])
])
config = CrawlerRunConfig(
deep_crawl_strategy = BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
filter_chain=filter_chain,
url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler:
print("Starting deep crawl in streaming mode:")
config.stream = True
start_time = time.perf_counter()
async for result in await crawler.arun(
url="https://techcrunch.com",
config=config
):
print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,106 @@
"""
Tests for the content_source parameter in markdown generation.
"""
import unittest
import asyncio
from unittest.mock import patch, MagicMock
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.models import MarkdownGenerationResult
HTML_SAMPLE = """
<html>
<head><title>Test Page</title></head>
<body>
<h1>Test Content</h1>
<p>This is a test paragraph.</p>
<div class="container">
<p>This is content within a container.</p>
</div>
</body>
</html>
"""
class TestContentSourceParameter(unittest.TestCase):
"""Test cases for the content_source parameter in markdown generation."""
def setUp(self):
"""Set up test fixtures."""
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
def tearDown(self):
"""Tear down test fixtures."""
self.loop.close()
def test_default_content_source(self):
"""Test that the default content_source is 'cleaned_html'."""
# Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator
generator = DefaultMarkdownGenerator()
self.assertEqual(generator.content_source, "cleaned_html")
def test_custom_content_source(self):
"""Test that content_source can be customized."""
generator = DefaultMarkdownGenerator(content_source="fit_html")
self.assertEqual(generator.content_source, "fit_html")
@patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text')
def test_html_processing_using_input_html(self, mock_html2text):
"""Test that generate_markdown uses input_html parameter."""
# Setup mock
mock_instance = MagicMock()
mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph."
mock_html2text.return_value = mock_instance
# Create generator and call generate_markdown
generator = DefaultMarkdownGenerator()
result = generator.generate_markdown(input_html="<h1>Test Content</h1><p>This is a test paragraph.</p>")
# Verify input_html was passed to HTML2Text handler
mock_instance.handle.assert_called_once()
# Get the first positional argument
args, _ = mock_instance.handle.call_args
self.assertEqual(args[0], "<h1>Test Content</h1><p>This is a test paragraph.</p>")
# Check result
self.assertIsInstance(result, MarkdownGenerationResult)
self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.")
def test_html_source_selection_logic(self):
"""Test that the HTML source selection logic works correctly."""
# We'll test the dispatch pattern directly to avoid async complexities
# Create test data
raw_html = "<html><body><h1>Raw HTML</h1></body></html>"
cleaned_html = "<html><body><h1>Cleaned HTML</h1></body></html>"
fit_html = "<html><body><h1>Preprocessed HTML</h1></body></html>"
# Test the dispatch pattern
html_source_selector = {
"raw_html": lambda: raw_html,
"cleaned_html": lambda: cleaned_html,
"fit_html": lambda: fit_html,
}
# Test Case 1: content_source="cleaned_html"
source_lambda = html_source_selector.get("cleaned_html")
self.assertEqual(source_lambda(), cleaned_html)
# Test Case 2: content_source="raw_html"
source_lambda = html_source_selector.get("raw_html")
self.assertEqual(source_lambda(), raw_html)
# Test Case 3: content_source="fit_html"
source_lambda = html_source_selector.get("fit_html")
self.assertEqual(source_lambda(), fit_html)
# Test Case 4: Invalid content_source falls back to cleaned_html
source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html)
self.assertEqual(source_lambda(), cleaned_html)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,17 @@
# example_usage.py
import asyncio
from crawl4ai.crawlers import get_crawler
async def main():
# Get the registered crawler
example_crawler = get_crawler("example_site.content")
# Crawl example.com
result = await example_crawler(url="https://example.com")
print(result)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,46 @@
import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
"""Example deep crawl of documentation site."""
config = CrawlerRunConfig(
deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=2,
include_external=False
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
print("\nStarting deep crawl in batch mode:")
results = await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
)
print(f"Crawled {len(results)} pages")
print(f"Example page: {results[0].url}")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
print("Starting deep crawl in streaming mode:")
config.stream = True
start_time = time.perf_counter()
async for result in await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
):
print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,279 @@
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
async def test_pattern_filter():
# Test cases as list of tuples instead of dict for multiple patterns
test_cases = [
# Simple suffix patterns (*.html)
("*.html", {
"https://example.com/page.html": True,
"https://example.com/path/doc.html": True,
"https://example.com/page.htm": False,
"https://example.com/page.html?param=1": True,
}),
# Path prefix patterns (/foo/*)
("*/article/*", {
"https://example.com/article/123": True,
"https://example.com/blog/article/456": True,
"https://example.com/articles/789": False,
"https://example.com/article": False,
}),
# Complex patterns
("blog-*-[0-9]", {
"https://example.com/blog-post-1": True,
"https://example.com/blog-test-9": True,
"https://example.com/blog-post": False,
"https://example.com/blog-post-x": False,
}),
# Multiple patterns case
(["*.pdf", "*/download/*"], {
"https://example.com/doc.pdf": True,
"https://example.com/download/file.txt": True,
"https://example.com/path/download/doc": True,
"https://example.com/uploads/file.txt": False,
}),
# Edge cases
("*", {
"https://example.com": True,
"": True,
"http://test.com/path": True,
}),
# Complex regex
(r"^https?://.*\.example\.com/\d+", {
"https://sub.example.com/123": True,
"http://test.example.com/456": True,
"https://example.com/789": False,
"https://sub.example.com/abc": False,
})
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for patterns, test_urls in test_cases:
filter_obj = URLPatternFilter(patterns)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
return all_passed
# Run tests
print("Running Pattern Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n✨ All accuracy tests passed!")
else:
print("\n❌ Some accuracy tests failed!")
async def test_domain_filter():
from itertools import chain
# Test cases
test_cases = [
# Allowed domains
({"allowed": "example.com"}, {
"https://example.com/page": True,
"http://example.com": True,
"https://sub.example.com": False,
"https://other.com": False,
}),
({"allowed": ["example.com", "test.com"]}, {
"https://example.com/page": True,
"https://test.com/home": True,
"https://other.com": False,
}),
# Blocked domains
({"blocked": "malicious.com"}, {
"https://malicious.com": False,
"https://safe.com": True,
"http://malicious.com/login": False,
}),
({"blocked": ["spam.com", "ads.com"]}, {
"https://spam.com": False,
"https://ads.com/banner": False,
"https://example.com": True,
}),
# Allowed and Blocked combination
({"allowed": "example.com", "blocked": "sub.example.com"}, {
"https://example.com": True,
"https://sub.example.com": False,
"https://other.com": False,
}),
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for params, test_urls in test_cases:
filter_obj = DomainFilter(
allowed_domains=params.get("allowed"),
blocked_domains=params.get("blocked"),
)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"\u274C Failed: Params {params} with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: Params {params} with URL '{url}'")
return all_passed
# Run tests
print("Running Domain Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n\u2728 All accuracy tests passed!")
else:
print("\n\u274C Some accuracy tests failed!")
async def test_content_relevance_filter():
relevance_filter = ContentRelevanceFilter(
query="What was the cause of american civil war?",
threshold=1
)
test_cases = {
"https://en.wikipedia.org/wiki/Cricket": False,
"https://en.wikipedia.org/wiki/American_Civil_War": True,
}
print("\nRunning Content Relevance Filter Tests...")
print("-" * 50)
all_passed = True
for url, expected in test_cases.items():
result = await relevance_filter.apply(url)
if result != expected:
print(f"\u274C Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: URL '{url}'")
if all_passed:
print("\n\u2728 All content relevance tests passed!")
else:
print("\n\u274C Some content relevance tests failed!")
async def test_content_type_filter():
from itertools import chain
# Test cases
test_cases = [
# Allowed single type
({"allowed": "image/png"}, {
"https://example.com/image.png": True,
"https://example.com/photo.jpg": False,
"https://example.com/document.pdf": False,
}),
# Multiple allowed types
({"allowed": ["image/jpeg", "application/pdf"]}, {
"https://example.com/photo.jpg": True,
"https://example.com/document.pdf": True,
"https://example.com/script.js": False,
}),
# URLs with no file extension should be allowed
({"allowed": "application/json"}, {
"https://example.com/api/data": True,
"https://example.com/data.json": True,
"https://example.com/page.html": False,
}),
# Unknown extensions are treated as application/octet-stream
({"allowed": "application/octet-stream"}, {
"https://example.com/file.unknown": True,
"https://example.com/archive.zip": False,
"https://example.com/software.exe": False,
}),
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for params, test_urls in test_cases:
filter_obj = ContentTypeFilter(
allowed_types=params.get("allowed"),
)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"\u274C Failed: Params {params} with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: Params {params} with URL '{url}'")
return all_passed
# Run tests
print("Running Content Type Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n\u2728 All accuracy tests passed!")
else:
print("\n\u274C Some accuracy tests failed!")
async def test_seo_filter():
seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])
test_cases = {
"https://en.wikipedia.org/wiki/Search_engine_optimization": True,
"https://en.wikipedia.org/wiki/Randomness": False,
}
print("\nRunning SEO Filter Tests...")
print("-" * 50)
all_passed = True
for url, expected in test_cases.items():
result = await seo_filter.apply(url)
if result != expected:
print(f"\u274C Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"\u2705 Passed: URL '{url}'")
if all_passed:
print("\n\u2728 All SEO filter tests passed!")
else:
print("\n\u274C Some SEO filter tests failed!")
import asyncio
if __name__ == "__main__":
asyncio.run(test_pattern_filter())
asyncio.run(test_domain_filter())
asyncio.run(test_content_type_filter())
asyncio.run(test_content_relevance_filter())
asyncio.run(test_seo_filter())

View File

@@ -0,0 +1,179 @@
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
def test_scorers():
test_cases = [
# Keyword Scorer Tests
{
"scorer_type": "keyword",
"config": {
"keywords": ["python", "blog"],
"weight": 1.0,
"case_sensitive": False
},
"urls": {
"https://example.com/python-blog": 1.0,
"https://example.com/PYTHON-BLOG": 1.0,
"https://example.com/python-only": 0.5,
"https://example.com/other": 0.0
}
},
# Path Depth Scorer Tests
{
"scorer_type": "path_depth",
"config": {
"optimal_depth": 2,
"weight": 1.0
},
"urls": {
"https://example.com/a/b": 1.0,
"https://example.com/a": 0.5,
"https://example.com/a/b/c": 0.5,
"https://example.com": 0.33333333
}
},
# Content Type Scorer Tests
{
"scorer_type": "content_type",
"config": {
"type_weights": {
".html$": 1.0,
".pdf$": 0.8,
".jpg$": 0.6
},
"weight": 1.0
},
"urls": {
"https://example.com/doc.html": 1.0,
"https://example.com/doc.pdf": 0.8,
"https://example.com/img.jpg": 0.6,
"https://example.com/other.txt": 0.0
}
},
# Freshness Scorer Tests
{
"scorer_type": "freshness",
"config": {
"weight": 1.0, # Remove current_year since original doesn't support it
},
"urls": {
"https://example.com/2024/01/post": 1.0,
"https://example.com/2023/12/post": 0.9,
"https://example.com/2022/post": 0.8,
"https://example.com/no-date": 0.5
}
},
# Domain Authority Scorer Tests
{
"scorer_type": "domain",
"config": {
"domain_weights": {
"python.org": 1.0,
"github.com": 0.8,
"medium.com": 0.6
},
"default_weight": 0.3,
"weight": 1.0
},
"urls": {
"https://python.org/about": 1.0,
"https://github.com/repo": 0.8,
"https://medium.com/post": 0.6,
"https://unknown.com": 0.3
}
}
]
def create_scorer(scorer_type, config):
if scorer_type == "keyword":
return KeywordRelevanceScorer(**config)
elif scorer_type == "path_depth":
return PathDepthScorer(**config)
elif scorer_type == "content_type":
return ContentTypeScorer(**config)
elif scorer_type == "freshness":
return FreshnessScorer(**config, current_year=2024)
elif scorer_type == "domain":
return DomainAuthorityScorer(**config)
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for test_case in test_cases:
print(f"\nTesting {test_case['scorer_type']} scorer:")
scorer = create_scorer(
test_case['scorer_type'],
test_case['config']
)
for url, expected in test_case['urls'].items():
score = round(scorer.score(url), 8)
expected = round(expected, 8)
if abs(score - expected) > 0.00001:
print(f"❌ Scorer Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {score}")
all_passed = False
else:
print(f"✅ Scorer Passed: URL '{url}'")
return all_passed
def run_composite_test():
print("\nTesting Composite Scorer:")
print("-" * 50)
# Create test data
test_urls = {
"https://python.org/blog/2024/01/new-release.html":0.86666667,
"https://github.com/repo/old-code.pdf": 0.62,
"https://unknown.com/random": 0.26
}
# Create composite scorers with all types
scorers = []
for test_case in test_cases:
scorer = create_scorer(
test_case['scorer_type'],
test_case['config']
)
scorers.append(scorer)
composite = CompositeScorer(scorers, normalize=True)
all_passed = True
for url, expected in test_urls.items():
score = round(composite.score(url), 8)
if abs(score - expected) > 0.00001:
print(f"❌ Composite Failed: URL '{url}'")
print(f" Expected: {expected}, Got: {score}")
all_passed = False
else:
print(f"✅ Composite Passed: URL '{url}'")
return all_passed
# Run tests
print("Running Scorer Tests...")
accuracy_passed = run_accuracy_test()
composite_passed = run_composite_test()
if accuracy_passed and composite_passed:
print("\n✨ All tests passed!")
# Note: Already have performance tests in run_scorer_performance_test()
else:
print("\n❌ Some tests failed!")
if __name__ == "__main__":
test_scorers()

View File

@@ -0,0 +1,116 @@
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
from crawl4ai.async_crawler_strategy import ConnectionTimeoutError
import asyncio
import os
async def main():
"""Test the AsyncHTTPCrawlerStrategy with various scenarios"""
logger = AsyncLogger(verbose=True)
# Initialize the strategy with default HTTPCrawlerConfig
crawler = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(),
logger=logger
)
# Test 1: Basic HTTP GET
print("\n=== Test 1: Basic HTTP GET ===")
result = await crawler.crawl("https://example.com")
print(f"Status: {result.status_code}")
print(f"Content length: {len(result.html)}")
print(f"Headers: {dict(result.response_headers)}")
# Test 2: POST request with JSON
print("\n=== Test 2: POST with JSON ===")
crawler.browser_config = crawler.browser_config.clone(
method="POST",
json={"test": "data"},
headers={"Content-Type": "application/json"}
)
try:
result = await crawler.crawl(
"https://httpbin.org/post",
)
print(f"Status: {result.status_code}")
print(f"Response: {result.html[:200]}...")
except Exception as e:
print(f"Error: {e}")
# Test 3: File handling
crawler.browser_config = HTTPCrawlerConfig()
print("\n=== Test 3: Local file handling ===")
# Create a tmp file with test content
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(b"<html><body>Test content</body></html>")
f.close()
result = await crawler.crawl(f"file://{f.name}")
print(f"File content: {result.html}")
# Test 4: Raw content
print("\n=== Test 4: Raw content handling ===")
raw_html = "raw://<html><body>Raw test content</body></html>"
result = await crawler.crawl(raw_html)
print(f"Raw content: {result.html}")
# Test 5: Custom hooks
print("\n=== Test 5: Custom hooks ===")
async def before_request(url, kwargs):
print(f"Before request to {url}")
kwargs['headers']['X-Custom'] = 'test'
async def after_request(response):
print(f"After request, status: {response.status_code}")
crawler.set_hook('before_request', before_request)
crawler.set_hook('after_request', after_request)
result = await crawler.crawl("https://example.com")
# Test 6: Error handling
print("\n=== Test 6: Error handling ===")
try:
await crawler.crawl("https://nonexistent.domain.test")
except Exception as e:
print(f"Expected error: {e}")
# Test 7: Redirects
print("\n=== Test 7: Redirect handling ===")
crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
result = await crawler.crawl("http://httpbin.org/redirect/1")
print(f"Final URL: {result.redirected_url}")
# Test 8: Custom timeout
print("\n=== Test 8: Custom timeout ===")
try:
await crawler.crawl(
"https://httpbin.org/delay/5",
config=CrawlerRunConfig(page_timeout=2)
)
except ConnectionTimeoutError as e:
print(f"Expected timeout: {e}")
# Test 9: SSL verification
print("\n=== Test 9: SSL verification ===")
crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
try:
await crawler.crawl("https://expired.badssl.com/")
print("Connected to invalid SSL site with verification disabled")
except Exception as e:
print(f"SSL error: {e}")
# Test 10: Large file streaming
print("\n=== Test 10: Large file streaming ===")
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(b"<html><body>" + b"X" * 1024 * 1024 * 10 + b"</body></html>")
f.close()
result = await crawler.crawl("file://" + f.name)
print(f"Large file content length: {len(result.html)}")
os.remove(f.name)
await crawler.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,6 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
)
filter = LLMContentFilter(
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:

213
tests/general/test_mhtml.py Normal file
View File

@@ -0,0 +1,213 @@
# test_mhtml_capture.py
import pytest
import asyncio
import re # For more robust MHTML checks
# Assuming these can be imported directly from the crawl4ai library
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult
# A reliable, simple static HTML page for testing
# Using httpbin as it's designed for testing clients
TEST_URL_SIMPLE = "https://httpbin.org/html"
EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"
# A slightly more complex page that might involve JS (good secondary test)
TEST_URL_JS = "https://quotes.toscrape.com/js/"
EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML
# Removed the custom event_loop fixture as pytest-asyncio provides a default one.
@pytest.mark.asyncio
async def test_mhtml_capture_when_enabled():
"""
Verify that when CrawlerRunConfig has capture_mhtml=True,
the CrawlResult contains valid MHTML content.
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True) # Use headless for testing CI/CD
# --- Key: Enable MHTML capture in the run config ---
run_config = CrawlerRunConfig(capture_mhtml=True)
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
# Perform the crawl with the MHTML-enabled config
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
# --- Assertions ---
assert result is not None, "Crawler should return a result object"
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
# 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated)
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check if mhtml is populated
assert result.mhtml is not None, "MHTML content should be captured when enabled"
assert isinstance(result.mhtml, str), "MHTML content should be a string"
assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check
# 3. Check for MHTML structure indicators (more robust than simple string contains)
# MHTML files are multipart MIME messages
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
"MHTML should contain 'Content-Type: multipart/related;'"
# Should contain a boundary definition
assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
"MHTML should contain a multipart boundary"
# Should contain the main HTML part
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
"MHTML should contain a 'Content-Type: text/html' part"
# 4. Check if the *actual page content* is within the MHTML string
# This confirms the snapshot captured the rendered page
assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"
# 5. Ensure standard HTML is still present and correct
assert result.html is not None, "Standard HTML should still be present"
assert isinstance(result.html, str), "Standard HTML should be a string"
assert EXPECTED_CONTENT_SIMPLE in result.html, \
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
# Help the garbage collector clean up
crawler = None
@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_explicitly():
"""
Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
the CrawlResult.mhtml attribute is None.
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
# --- Key: Explicitly disable MHTML capture ---
run_config = CrawlerRunConfig(capture_mhtml=False)
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
# 1. Check attribute existence (important for TDD start)
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check mhtml is None
assert result.mhtml is None, "MHTML content should be None when explicitly disabled"
# 3. Ensure standard HTML is still present
assert result.html is not None
assert EXPECTED_CONTENT_SIMPLE in result.html
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
# Help the garbage collector clean up
crawler = None
@pytest.mark.asyncio
async def test_mhtml_capture_when_disabled_by_default():
"""
Verify that if capture_mhtml is not specified (using its default),
the CrawlResult.mhtml attribute is None.
(This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
# --- Key: Use default run config ---
run_config = CrawlerRunConfig() # Do not specify capture_mhtml
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
# 1. Check attribute existence
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
# 2. Check mhtml is None (assuming default is False)
assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"
# 3. Ensure standard HTML is still present
assert result.html is not None
assert EXPECTED_CONTENT_SIMPLE in result.html
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
# Help the garbage collector clean up
crawler = None
# Optional: Add a test for a JS-heavy page if needed
@pytest.mark.asyncio
async def test_mhtml_capture_on_js_page_when_enabled():
"""
Verify MHTML capture works on a page requiring JavaScript execution.
"""
# Create a fresh browser config and crawler instance for this test
browser_config = BrowserConfig(headless=True)
run_config = CrawlerRunConfig(
capture_mhtml=True,
# Add a small wait or JS execution if needed for the JS page to fully render
# For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer
# wait_for_timeout=2000 # Example: wait up to 2 seconds
js_code="await new Promise(r => setTimeout(r, 500));" # Small delay after potential load
)
# Create a fresh crawler instance
crawler = AsyncWebCrawler(config=browser_config)
try:
# Start the browser
await crawler.start()
result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)
assert result is not None
assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
assert isinstance(result.mhtml, str), "MHTML content should be a string"
assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"
# Check for MHTML structure
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)
# Check for content rendered by JS within the MHTML
assert EXPECTED_CONTENT_JS in result.mhtml, \
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"
# Check standard HTML too
assert result.html is not None
assert EXPECTED_CONTENT_JS in result.html, \
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"
finally:
# Important: Ensure browser is completely closed even if assertions fail
await crawler.close()
# Help the garbage collector clean up
crawler = None
if __name__ == "__main__":
# Use pytest for async tests
pytest.main(["-xvs", __file__])

View File

@@ -0,0 +1,185 @@
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
import asyncio
import aiohttp
from aiohttp import web
import tempfile
import shutil
import os, sys, time, json
async def start_test_server():
app = web.Application()
async def basic_page(request):
return web.Response(text="""
<!DOCTYPE html>
<html>
<head>
<title>Network Request Test</title>
</head>
<body>
<h1>Test Page for Network Capture</h1>
<p>This page performs network requests and console logging.</p>
<img src="/image.png" alt="Test Image">
<script>
console.log("Basic console log");
console.error("Error message");
console.warn("Warning message");
// Make some XHR requests
const xhr = new XMLHttpRequest();
xhr.open('GET', '/api/data', true);
xhr.send();
// Make a fetch request
fetch('/api/json')
.then(response => response.json())
.catch(error => console.error('Fetch error:', error));
// Trigger an error
setTimeout(() => {
try {
nonExistentFunction();
} catch (e) {
console.error("Caught error:", e);
}
}, 100);
</script>
</body>
</html>
""", content_type="text/html")
async def image(request):
# Return a small 1x1 transparent PNG
return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png")
async def api_data(request):
return web.Response(text="sample data")
async def api_json(request):
return web.json_response({"status": "success", "message": "JSON data"})
# Register routes
app.router.add_get('/', basic_page)
app.router.add_get('/image.png', image)
app.router.add_get('/api/data', api_data)
app.router.add_get('/api/json', api_json)
runner = web.AppRunner(app)
await runner.setup()
site = web.TCPSite(runner, 'localhost', 8080)
await site.start()
return runner
async def test_network_console_capture():
print("\n=== Testing Network and Console Capture ===\n")
# Start test server
runner = await start_test_server()
try:
browser_config = BrowserConfig(headless=True)
# Test with capture disabled (default)
print("\n1. Testing with capture disabled (default)...")
async with AsyncWebCrawler(config=browser_config) as crawler:
config = CrawlerRunConfig(
wait_until="networkidle", # Wait for network to be idle
)
result = await crawler.arun(url="http://localhost:8080/", config=config)
assert result.network_requests is None, "Network requests should be None when capture is disabled"
assert result.console_messages is None, "Console messages should be None when capture is disabled"
print("✓ Default config correctly returns None for network_requests and console_messages")
# Test with network capture enabled
print("\n2. Testing with network capture enabled...")
async with AsyncWebCrawler(config=browser_config) as crawler:
config = CrawlerRunConfig(
wait_until="networkidle", # Wait for network to be idle
capture_network_requests=True
)
result = await crawler.arun(url="http://localhost:8080/", config=config)
assert result.network_requests is not None, "Network requests should be captured"
print(f"✓ Captured {len(result.network_requests)} network requests")
# Check if we have both requests and responses
request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
print(f" - {request_count} requests, {response_count} responses")
# Check if we captured specific resources
urls = [r.get("url") for r in result.network_requests]
has_image = any("/image.png" in url for url in urls)
has_api_data = any("/api/data" in url for url in urls)
has_api_json = any("/api/json" in url for url in urls)
assert has_image, "Should have captured image request"
assert has_api_data, "Should have captured API data request"
assert has_api_json, "Should have captured API JSON request"
print("✓ Captured expected network requests (image, API endpoints)")
# Test with console capture enabled
print("\n3. Testing with console capture enabled...")
async with AsyncWebCrawler(config=browser_config) as crawler:
config = CrawlerRunConfig(
wait_until="networkidle", # Wait for network to be idle
capture_console_messages=True
)
result = await crawler.arun(url="http://localhost:8080/", config=config)
assert result.console_messages is not None, "Console messages should be captured"
print(f"✓ Captured {len(result.console_messages)} console messages")
# Check if we have different types of console messages
message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
print(f" - Message types: {', '.join(message_types)}")
# Print all captured messages for debugging
print(" - Captured messages:")
for msg in result.console_messages:
print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")
# Look for specific messages
messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
has_basic_log = any("Basic console log" in msg for msg in messages)
has_error_msg = any("Error message" in msg for msg in messages)
has_warning_msg = any("Warning message" in msg for msg in messages)
assert has_basic_log, "Should have captured basic console.log message"
assert has_error_msg, "Should have captured console.error message"
assert has_warning_msg, "Should have captured console.warn message"
print("✓ Captured expected console messages (log, error, warning)")
# Test with both captures enabled
print("\n4. Testing with both network and console capture enabled...")
async with AsyncWebCrawler(config=browser_config) as crawler:
config = CrawlerRunConfig(
wait_until="networkidle", # Wait for network to be idle
capture_network_requests=True,
capture_console_messages=True
)
result = await crawler.arun(url="http://localhost:8080/", config=config)
assert result.network_requests is not None, "Network requests should be captured"
assert result.console_messages is not None, "Console messages should be captured"
print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
finally:
await runner.cleanup()
print("\nTest server shutdown")
async def main():
try:
await test_network_console_capture()
print("\n✅ All tests passed successfully!")
except Exception as e:
print(f"\n❌ Test failed: {str(e)}")
raise
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -10,6 +10,7 @@ import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.utils import preprocess_html_for_schema
import json
# Test HTML - A complex job board with companies, departments, and positions

34
tests/hub/test_simple.py Normal file
View File

@@ -0,0 +1,34 @@
# test.py
from crawl4ai import CrawlerHub
import json
async def amazon_example():
if (crawler_cls := CrawlerHub.get("amazon_product")):
crawler = crawler_cls()
print(f"Crawler version: {crawler_cls.meta['version']}")
print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
print(await crawler.run("https://amazon.com/test"))
else:
print("Crawler not found!")
async def google_example():
# Get crawler dynamically
crawler_cls = CrawlerHub.get("google_search")
crawler = crawler_cls()
# Text search
text_results = await crawler.run(
query="apple inc",
search_type="text",
schema_cache_path="/Users/unclecode/.crawl4ai"
)
print(json.dumps(json.loads(text_results), indent=4))
# Image search
# image_results = await crawler.run(query="apple inc", search_type="image")
# print(image_results)
if __name__ == "__main__":
import asyncio
# asyncio.run(amazon_example())
asyncio.run(google_example())

View File

@@ -0,0 +1,80 @@
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
import os
from datetime import datetime
class AsyncFileLogger(AsyncLoggerBase):
"""
File-only asynchronous logger that writes logs to a specified file.
"""
def __init__(self, log_file: str):
"""
Initialize the file logger.
Args:
log_file: File path for logging
"""
self.log_file = log_file
os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
def _write_to_file(self, level: str, message: str, tag: str):
"""Write a message to the log file."""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
with open(self.log_file, "a", encoding="utf-8") as f:
f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
def debug(self, message: str, tag: str = "DEBUG", **kwargs):
"""Log a debug message to file."""
self._write_to_file("DEBUG", message, tag)
def info(self, message: str, tag: str = "INFO", **kwargs):
"""Log an info message to file."""
self._write_to_file("INFO", message, tag)
def success(self, message: str, tag: str = "SUCCESS", **kwargs):
"""Log a success message to file."""
self._write_to_file("SUCCESS", message, tag)
def warning(self, message: str, tag: str = "WARNING", **kwargs):
"""Log a warning message to file."""
self._write_to_file("WARNING", message, tag)
def error(self, message: str, tag: str = "ERROR", **kwargs):
"""Log an error message to file."""
self._write_to_file("ERROR", message, tag)
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
"""Log URL fetch status to file."""
status = "SUCCESS" if success else "FAILED"
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
self._write_to_file("URL_STATUS", message, tag)
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
"""Log error status to file."""
message = f"{url[:url_length]}... | Error: {error}"
self._write_to_file("ERROR", message, tag)
async def main():
browser_config = BrowserConfig(headless=True, verbose=True)
crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
await crawler.start()
try:
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
)
# Use the crawler multiple times
result = await crawler.arun(
url='https://kidocode.com/',
config=crawl_config
)
if result.success:
print("First crawl - Raw Markdown Length:", len(result.markdown.raw_markdown))
finally:
# Always ensure we close the crawler
await crawler.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,119 @@
# pip install "mcp-sdk[ws]" anyio
import anyio, json
from mcp.client.websocket import websocket_client
from mcp.client.session import ClientSession
async def test_list():
async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
async with ClientSession(r, w) as s:
await s.initialize()
print("tools :", [t.name for t in (await s.list_tools()).tools])
print("resources :", [r.name for r in (await s.list_resources()).resources])
print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates])
async def test_crawl(s: ClientSession) -> None:
"""Hit the @mcp_tool('crawl') endpoint."""
res = await s.call_tool(
"crawl",
{
"urls": ["https://example.com"],
"browser_config": {},
"crawler_config": {},
},
)
print("crawl →", json.loads(res.content[0].text))
async def test_md(s: ClientSession) -> None:
"""Hit the @mcp_tool('md') endpoint."""
res = await s.call_tool(
"md",
{
"url": "https://example.com",
"f": "fit", # or RAW, BM25, LLM
"q": None,
"c": "0",
},
)
result = json.loads(res.content[0].text)
print("md →", result['markdown'][:100], "...")
async def test_screenshot(s: ClientSession):
res = await s.call_tool(
"screenshot",
{
"url": "https://example.com",
"screenshot_wait_for": 1.0,
},
)
png_b64 = json.loads(res.content[0].text)["screenshot"]
print("screenshot →", png_b64[:60], "… (base64)")
async def test_pdf(s: ClientSession):
res = await s.call_tool(
"pdf",
{
"url": "https://example.com",
},
)
pdf_b64 = json.loads(res.content[0].text)["pdf"]
print("pdf →", pdf_b64[:60], "… (base64)")
async def test_execute_js(s: ClientSession):
# click the “More” link on Hacker News front page and wait 1 s
res = await s.call_tool(
"execute_js",
{
"url": "https://news.ycombinator.com/news",
"js_code": [
"await page.click('a.morelink')",
"await page.waitForTimeout(1000)",
],
},
)
crawl_result = json.loads(res.content[0].text)
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
async def test_html(s: ClientSession):
# fetch the rendered HTML of the Hacker News front page
res = await s.call_tool(
"html",
{
"url": "https://news.ycombinator.com/news",
},
)
crawl_result = json.loads(res.content[0].text)
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
async def test_context(s: ClientSession):
# ask the built-in Crawl4ai context endpoint a library-usage question
res = await s.call_tool(
"ask",
{
"query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?"
},
)
crawl_result = json.loads(res.content[0].text)
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
async def main() -> None:
async with websocket_client("ws://localhost:11235/mcp/ws") as (r, w):
async with ClientSession(r, w) as s:
await s.initialize() # handshake
tools = (await s.list_tools()).tools
print("tools:", [t.name for t in tools])
# await test_list()
await test_crawl(s)
await test_md(s)
await test_screenshot(s)
await test_pdf(s)
await test_execute_js(s)
await test_html(s)
await test_context(s)
anyio.run(main)

11
tests/mcp/test_mcp_sse.py Normal file
View File

@@ -0,0 +1,11 @@
from mcp.client.sse import sse_client
from mcp.client.session import ClientSession
async def main():
async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
async with ClientSession(r, w) as sess:
print(await sess.list_tools()) # now works
if __name__ == "__main__":
import asyncio
asyncio.run(main())

315
tests/memory/README.md Normal file
View File

@@ -0,0 +1,315 @@
# Crawl4AI Stress Testing and Benchmarking
This directory contains tools for stress testing Crawl4AI's `arun_many` method and dispatcher system with high volumes of URLs, to evaluate performance, exercise concurrency handling, and detect potential memory issues. It also includes a benchmarking system to track performance over time.
## Quick Start
```bash
# Run a default stress test (small config) and generate a report
# (Assumes run_all.sh is updated to call run_benchmark.py)
./run_all.sh
```
*Note: `run_all.sh` might need to be updated if it directly called the old script.*
## Overview
The stress testing system works by:
1. Generating a local test site with heavy HTML pages (regenerated by default for each test).
2. Starting a local HTTP server to serve these pages.
3. Running Crawl4AI's `arun_many` method against this local site using the `MemoryAdaptiveDispatcher` with configurable concurrency (`max_sessions`); a minimal sketch of this step follows this list.
4. Monitoring performance metrics via the `CrawlerMonitor` and optionally logging memory usage.
5. Optionally generating detailed benchmark reports with visualizations using `benchmark_report.py`.
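As a rough orientation, the core of step 3 looks like the sketch below. This is a minimal illustration only, not the actual `test_stress_sdk.py`: the real script also generates the test site, starts the local server, logs per-chunk metrics, and samples memory. The dispatcher and monitor parameter names follow the crawl4ai dispatcher API as documented at the time of writing and may differ between versions; the `page_{i}.html` URL pattern is a placeholder.
```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode


async def run_stress_sketch(url_count: int = 50, max_sessions: int = 8, port: int = 8000) -> None:
    # URLs served by the locally generated test site (steps 1-2); the path pattern is hypothetical.
    urls = [f"http://localhost:{port}/page_{i}.html" for i in range(url_count)]

    dispatcher = MemoryAdaptiveDispatcher(
        max_session_permit=max_sessions,  # corresponds to --max-sessions
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED),
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
        ok = sum(1 for r in results if r.success)
        print(f"{ok}/{len(urls)} URLs crawled successfully")


if __name__ == "__main__":
    asyncio.run(run_stress_sketch())
```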
## Available Tools
- `test_stress_sdk.py` - Main stress testing script utilizing `arun_many` and dispatchers.
- `benchmark_report.py` - Report generator for comparing test results (assumes compatibility with `test_stress_sdk.py` outputs).
- `run_benchmark.py` - Python script with predefined test configurations that orchestrates tests using `test_stress_sdk.py`.
- `run_all.sh` - Simple wrapper script (may need updating).
## Usage Guide
### Using Predefined Configurations (Recommended)
The `run_benchmark.py` script offers the easiest way to run standardized tests:
```bash
# Quick test (50 URLs, 4 max sessions)
python run_benchmark.py quick
# Medium test (500 URLs, 16 max sessions)
python run_benchmark.py medium
# Large test (1000 URLs, 32 max sessions)
python run_benchmark.py large
# Extreme test (2000 URLs, 64 max sessions)
python run_benchmark.py extreme
# Custom configuration
python run_benchmark.py custom --urls 300 --max-sessions 24 --chunk-size 50
# Run 'small' test in streaming mode
python run_benchmark.py small --stream
# Override max_sessions for the 'medium' config
python run_benchmark.py medium --max-sessions 20
# Skip benchmark report generation after the test
python run_benchmark.py small --no-report
# Clean up reports and site files before running
python run_benchmark.py medium --clean
```
#### `run_benchmark.py` Parameters
| Parameter | Default | Description |
| -------------------- | --------------- | --------------------------------------------------------------------------- |
| `config` | *required* | Test configuration: `quick`, `small`, `medium`, `large`, `extreme`, `custom`|
| `--urls` | config-specific | Number of URLs (required for `custom`) |
| `--max-sessions` | config-specific | Max concurrent sessions managed by dispatcher (required for `custom`) |
| `--chunk-size` | config-specific | URLs per batch for non-stream logging (required for `custom`) |
| `--stream` | False | Enable streaming results (disables batch logging) |
| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live monitor |
| `--use-rate-limiter` | False | Enable basic rate limiter in the dispatcher |
| `--port` | 8000 | HTTP server port |
| `--no-report` | False | Skip generating comparison report via `benchmark_report.py` |
| `--clean` | False | Clean up reports and site files before running |
| `--keep-server-alive`| False | Keep local HTTP server running after test |
| `--use-existing-site`| False | Use existing site on specified port (no local server start/site gen) |
| `--skip-generation` | False | Use existing site files but start local server |
| `--keep-site` | False | Keep generated site files after test |
#### Predefined Configurations
| Configuration | URLs | Max Sessions | Chunk Size | Description |
| ------------- | ------ | ------------ | ---------- | -------------------------------- |
| `quick` | 50 | 4 | 10 | Quick test for basic validation |
| `small` | 100 | 8 | 20 | Small test for routine checks |
| `medium` | 500 | 16 | 50 | Medium test for thorough checks |
| `large` | 1000 | 32 | 100 | Large test for stress testing |
| `extreme` | 2000 | 64 | 200 | Extreme test for limit testing |
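These presets simply parameterize the `--urls`, `--max-sessions`, and `--chunk-size` flags of `test_stress_sdk.py`. A plausible encoding is a small lookup table like the one below; the names and structure here are illustrative and may not match the actual implementation inside `run_benchmark.py`.
```python
# Illustrative only: run_benchmark.py's real internals may differ.
PRESETS = {
    "quick":   {"urls": 50,   "max_sessions": 4,  "chunk_size": 10},
    "small":   {"urls": 100,  "max_sessions": 8,  "chunk_size": 20},
    "medium":  {"urls": 500,  "max_sessions": 16, "chunk_size": 50},
    "large":   {"urls": 1000, "max_sessions": 32, "chunk_size": 100},
    "extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200},
}

def build_sdk_args(name: str, **overrides) -> list[str]:
    """Merge a preset with CLI overrides (e.g. --max-sessions 20) into test_stress_sdk.py arguments."""
    cfg = {**PRESETS[name], **{k: v for k, v in overrides.items() if v is not None}}
    return [
        "--urls", str(cfg["urls"]),
        "--max-sessions", str(cfg["max_sessions"]),
        "--chunk-size", str(cfg["chunk_size"]),
    ]
```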
### Direct Usage of `test_stress_sdk.py`
For fine-grained control or debugging, you can run the stress test script directly:
```bash
# Test with 200 URLs and 32 max concurrent sessions
python test_stress_sdk.py --urls 200 --max-sessions 32 --chunk-size 40
# Clean up previous test data first
python test_stress_sdk.py --clean-reports --clean-site --urls 100 --max-sessions 16 --chunk-size 20
# Change the HTTP server port and use aggregated monitor
python test_stress_sdk.py --port 8088 --urls 100 --max-sessions 16 --monitor-mode AGGREGATED
# Enable streaming mode and use rate limiting
python test_stress_sdk.py --urls 50 --max-sessions 8 --stream --use-rate-limiter
# Change report output location
python test_stress_sdk.py --report-path custom_reports --urls 100 --max-sessions 16
```
#### `test_stress_sdk.py` Parameters
| Parameter | Default | Description |
| -------------------- | ---------- | -------------------------------------------------------------------- |
| `--urls` | 100 | Number of URLs to test |
| `--max-sessions` | 16 | Maximum concurrent crawling sessions managed by the dispatcher |
| `--chunk-size` | 10 | Number of URLs per batch (relevant for non-stream logging) |
| `--stream` | False | Enable streaming results (disables batch logging) |
| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live `CrawlerMonitor` |
| `--use-rate-limiter` | False | Enable a basic `RateLimiter` within the dispatcher |
| `--site-path` | "test_site"| Path to store/use the generated test site |
| `--port` | 8000 | Port for the local HTTP server |
| `--report-path` | "reports" | Path to save test result summary (JSON) and memory samples (CSV) |
| `--skip-generation` | False | Use existing test site files but still start local server |
| `--use-existing-site`| False | Use existing site on specified port (no local server/site gen) |
| `--keep-server-alive`| False | Keep local HTTP server running after test completion |
| `--keep-site` | False | Keep the generated test site files after test completion |
| `--clean-reports` | False | Clean up report directory before running |
| `--clean-site` | False | Clean up site directory before/after running (see script logic) |
### Generating Reports Only
If you only want to generate a benchmark report from existing test results (assuming `benchmark_report.py` is compatible):
```bash
# Generate a report from existing test results in ./reports/
python benchmark_report.py
# Limit to the most recent 5 test results
python benchmark_report.py --limit 5
# Specify a custom source directory for test results
python benchmark_report.py --reports-dir alternate_results
```
#### `benchmark_report.py` Parameters (Assumed)
| Parameter | Default | Description |
| --------------- | -------------------- | ----------------------------------------------------------- |
| `--reports-dir` | "reports" | Directory containing `test_stress_sdk.py` result files |
| `--output-dir` | "benchmark_reports" | Directory to save generated HTML reports and charts |
| `--limit` | None (all results) | Limit comparison to N most recent test results |
| `--output-file` | Auto-generated | Custom output filename for the HTML report |
## Understanding the Test Output
### Real-time Progress Display (`CrawlerMonitor`)
When running `test_stress_sdk.py`, the `CrawlerMonitor` provides a live view of the crawling process managed by the dispatcher. The two display modes below map onto the monitor configuration sketched after this list.
- **DETAILED Mode (Default):** Shows individual task status (Queued, Active, Completed, Failed), timings, memory usage per task (if `psutil` is available), overall queue statistics, and memory pressure status (if `psutil` available).
- **AGGREGATED Mode:** Shows summary counts (Queued, Active, Completed, Failed), overall progress percentage, estimated time remaining, average URLs/sec, and memory pressure status.
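Which display you get is determined by the monitor attached to the dispatcher; `--monitor-mode` maps onto something like the sketch below (based on the crawl4ai dispatcher API; exact class and parameter names may vary between versions).
```python
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode

def make_dispatcher(monitor_mode: str, max_sessions: int) -> MemoryAdaptiveDispatcher:
    """Sketch: translate --monitor-mode DETAILED|AGGREGATED into a monitored dispatcher."""
    mode = DisplayMode.DETAILED if monitor_mode.upper() == "DETAILED" else DisplayMode.AGGREGATED
    return MemoryAdaptiveDispatcher(
        max_session_permit=max_sessions,
        monitor=CrawlerMonitor(display_mode=mode),
    )
```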
### Batch Log Output (Non-Streaming Mode Only)
If running `test_stress_sdk.py` **without** the `--stream` flag, you will *also* see per-batch summary lines printed to the console *after* the monitor display, once each chunk of URLs finishes processing:
```
Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status
───────────────────────────────────────────────────────────────────────────────────────────
1 | 10.0% | 50.1 MB | 55.3 MB | 23.8 | 10/0 | 0.42 | Success
2 | 20.0% | 55.3 MB | 60.1 MB | 24.1 | 10/0 | 0.41 | Success
...
```
This display provides chunk-specific metrics:
- **Batch**: The batch number being reported.
- **Progress**: Overall percentage of total URLs processed *after* this batch.
- **Start Mem / End Mem**: Memory usage before and after processing this batch (if tracked).
- **URLs/sec**: Processing speed *for this specific batch*.
- **Success/Fail**: Number of successful and failed URLs *in this batch*.
- **Time (s)**: Wall-clock time taken to process *this batch*.
- **Status**: Color-coded status for the batch outcome.
### Summary Output
After test completion, a final summary is displayed:
```
================================================================================
Test Completed
================================================================================
Test ID: 20250418_103015
Configuration: 100 URLs, 16 max sessions, Chunk: 10, Stream: False, Monitor: DETAILED
Results: 100 successful, 0 failed (100 processed, 100.0% success)
Performance: 5.85 seconds total, 17.09 URLs/second avg
Memory Usage: Start: 50.1 MB, End: 75.3 MB, Max: 78.1 MB, Growth: 25.2 MB
Results summary saved to reports/test_summary_20250418_103015.json
```
### HTML Report Structure (Generated by `benchmark_report.py`)
(This section remains the same, assuming `benchmark_report.py` generates these)
The benchmark report contains several sections:
1. **Summary**: Overview of the latest test results and trends
2. **Performance Comparison**: Charts showing throughput across tests
3. **Memory Usage**: Detailed memory usage graphs for each test
4. **Detailed Results**: Tabular data of all test metrics
5. **Conclusion**: Automated analysis of performance and memory patterns
### Memory Metrics
(This section remains conceptually the same)
Memory growth is the key metric for detecting leaks...
### Performance Metrics
(This section remains conceptually the same, though "URLs per Worker" is less relevant - focus on overall URLs/sec)
Key performance indicators include:
- **URLs per Second**: Higher is better (throughput)
- **Success Rate**: Should be 100% in normal conditions
- **Total Processing Time**: Lower is better
- **Dispatcher Efficiency**: Observe queue lengths and wait times in the monitor (Detailed mode)
### Raw Data Files
Raw data is saved in the `--report-path` directory (default `./reports/`):
- **JSON files** (`test_summary_*.json`): Contains the final summary for each test run.
- **CSV files** (`memory_samples_*.csv`): Contains time-series memory samples taken during the test run.
Example of reading raw data:
```python
import json
import pandas as pd
# Load test summary
test_id = "20250418_103015" # Example ID
with open(f'reports/test_summary_{test_id}.json', 'r') as f:
results = json.load(f)
# Load memory samples
memory_df = pd.read_csv(f'reports/memory_samples_{test_id}.csv')
# Analyze memory_df (e.g., calculate growth, plot)
if not memory_df['memory_info_mb'].isnull().all():
growth = memory_df['memory_info_mb'].iloc[-1] - memory_df['memory_info_mb'].iloc[0]
print(f"Total Memory Growth: {growth:.1f} MB")
else:
print("No valid memory samples found.")
print(f"Avg URLs/sec: {results['urls_processed'] / results['total_time_seconds']:.2f}")
```
## Visualization Dependencies
(This section remains the same)
For full visualization capabilities in the HTML reports generated by `benchmark_report.py`, install additional dependencies...
## Directory Structure
```
benchmarking/ # Or your top-level directory name
├── benchmark_reports/ # Generated HTML reports (by benchmark_report.py)
├── reports/ # Raw test result data (from test_stress_sdk.py)
├── test_site/ # Generated test content (temporary)
├── benchmark_report.py# Report generator
├── run_benchmark.py # Test runner with predefined configs
├── test_stress_sdk.py # Main stress test implementation using arun_many
└── run_all.sh # Simple wrapper script (may need updates)
#└── requirements.txt # Optional: Visualization dependencies for benchmark_report.py
```
## Cleanup
To clean up after testing:
```bash
# Remove the test site content (if not using --keep-site)
rm -rf test_site
# Remove all raw reports and generated benchmark reports
rm -rf reports benchmark_reports
# Or use the --clean flag with run_benchmark.py
python run_benchmark.py medium --clean
```
## Use in CI/CD
(This section remains conceptually the same, just update script names)
These tests can be integrated into CI/CD pipelines:
```bash
# Example CI script
python run_benchmark.py medium --no-report # Run test without interactive report gen
# Check exit code
if [ $? -ne 0 ]; then echo "Stress test failed!"; exit 1; fi
# Optionally, run report generator and check its output/metrics
# python benchmark_report.py
# check_report_metrics.py reports/test_summary_*.json || exit 1
exit 0
```
## Troubleshooting
- **HTTP Server Port Conflict**: Use `--port` with `run_benchmark.py` or `test_stress_sdk.py`.
- **Memory Tracking Issues**: The `SimpleMemoryTracker` uses platform commands (`ps`, `/proc`, `tasklist`); a sketch of this approach follows this list. Ensure these are available and the script has permission. If it consistently fails, memory reporting will be limited.
- **Visualization Missing**: Related to `benchmark_report.py` and its dependencies.
- **Site Generation Issues**: Check permissions for creating `./test_site/`. Use `--skip-generation` if you want to manage the site manually.
- **Testing Against External Site**: Ensure the external site is running and use `--use-existing-site --port <correct_port>`.
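As a concrete illustration of the platform-command approach mentioned under *Memory Tracking Issues* above, a best-effort memory probe on Linux/macOS can be as simple as the sketch below; the actual `SimpleMemoryTracker` may read `/proc` or call `tasklist` on Windows and handle failures differently.
```python
import os
import subprocess

def rss_mb(pid: int | None = None) -> float | None:
    """Best-effort resident set size in MB via `ps` (Linux/macOS only).

    Sketch of the approach described above; SimpleMemoryTracker's real
    implementation may differ. Returns None if the probe fails.
    """
    pid = pid or os.getpid()
    try:
        out = subprocess.run(
            ["ps", "-o", "rss=", "-p", str(pid)],
            capture_output=True, text=True, check=True,
        )
        return int(out.stdout.strip()) / 1024.0  # ps reports RSS in kilobytes
    except (subprocess.SubprocessError, ValueError, FileNotFoundError):
        return None  # memory reporting will be limited, as noted above
```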

887
tests/memory/benchmark_report.py Executable file
View File

@@ -0,0 +1,887 @@
#!/usr/bin/env python3
"""
Benchmark reporting tool for Crawl4AI stress tests.
Generates visual reports and comparisons between test runs.
"""
import os
import json
import glob
import argparse
import sys
from datetime import datetime
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
# Initialize rich console
console = Console()
# Try to import optional visualization dependencies
VISUALIZATION_AVAILABLE = True
try:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import seaborn as sns
except ImportError:
VISUALIZATION_AVAILABLE = False
console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
console.print("[yellow]Only text-based reports will be generated.[/yellow]")
# Configure plotting if available
if VISUALIZATION_AVAILABLE:
# Set plot style for dark theme
plt.style.use('dark_background')
sns.set_theme(style="darkgrid")
# Custom color palette based on Nord theme
nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
sns.set_palette(nord_palette)
class BenchmarkReporter:
"""Generates visual reports and comparisons for Crawl4AI stress tests."""
def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
"""Initialize the benchmark reporter.
Args:
reports_dir: Directory containing test result files
output_dir: Directory to save generated reports
"""
self.reports_dir = Path(reports_dir)
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Configure matplotlib if available
if VISUALIZATION_AVAILABLE:
# Ensure the matplotlib backend works in headless environments
mpl.use('Agg')
# Set up styling for plots with dark theme
mpl.rcParams['figure.figsize'] = (12, 8)
mpl.rcParams['font.size'] = 12
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['legend.fontsize'] = 12
mpl.rcParams['figure.facecolor'] = '#1e1e1e'
mpl.rcParams['axes.facecolor'] = '#2e3440'
mpl.rcParams['savefig.facecolor'] = '#1e1e1e'
mpl.rcParams['text.color'] = '#e0e0e0'
mpl.rcParams['axes.labelcolor'] = '#e0e0e0'
mpl.rcParams['xtick.color'] = '#e0e0e0'
mpl.rcParams['ytick.color'] = '#e0e0e0'
mpl.rcParams['grid.color'] = '#444444'
mpl.rcParams['figure.edgecolor'] = '#444444'
def load_test_results(self, limit=None):
"""Load all test results from the reports directory.
Args:
limit: Optional limit on number of most recent tests to load
Returns:
Dictionary mapping test IDs to result data
"""
result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
# Sort files by modification time (newest first)
result_files.sort(key=os.path.getmtime, reverse=True)
if limit:
result_files = result_files[:limit]
results = {}
for file_path in result_files:
try:
with open(file_path, 'r') as f:
data = json.load(f)
test_id = data.get('test_id')
if test_id:
results[test_id] = data
# Try to load the corresponding memory samples
csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
if csv_path.exists():
try:
memory_df = pd.read_csv(csv_path)
results[test_id]['memory_samples'] = memory_df
except Exception as e:
console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
except Exception as e:
console.print(f"[red]Error loading {file_path}: {e}[/red]")
console.print(f"Loaded {len(results)} test results")
return results
def generate_summary_table(self, results):
"""Generate a summary table of test results.
Args:
results: Dictionary mapping test IDs to result data
Returns:
Rich Table object
"""
table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
# Define columns
table.add_column("Test ID", style="cyan")
table.add_column("Date", style="bright_green")
table.add_column("URLs", justify="right")
table.add_column("Workers", justify="right")
table.add_column("Success %", justify="right")
table.add_column("Time (s)", justify="right")
table.add_column("Mem Growth", justify="right")
table.add_column("URLs/sec", justify="right")
# Add rows
for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
# Parse timestamp from test_id
try:
date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
except:
date_str = "Unknown"
# Calculate success percentage
total_urls = data.get('url_count', 0)
successful = data.get('successful_urls', 0)
success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
# Calculate memory growth if available
mem_growth = "N/A"
if 'memory_samples' in data:
samples = data['memory_samples']
if len(samples) >= 2:
# Try to extract numeric values from memory_info strings
try:
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
mem_growth = f"{last_mem - first_mem:.1f} MB"
except:
pass
# Calculate URLs per second
time_taken = data.get('total_time_seconds', 0)
urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
table.add_row(
test_id,
date_str,
str(total_urls),
str(data.get('workers', 'N/A')),
f"{success_pct:.1f}%",
f"{data.get('total_time_seconds', 0):.2f}",
mem_growth,
f"{urls_per_sec:.1f}"
)
return table
def generate_performance_chart(self, results, output_file=None):
"""Generate a performance comparison chart.
Args:
results: Dictionary mapping test IDs to result data
output_file: File path to save the chart
Returns:
Path to the saved chart file or None if visualization is not available
"""
if not VISUALIZATION_AVAILABLE:
console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
return None
# Extract relevant data
data = []
for test_id, result in results.items():
urls = result.get('url_count', 0)
workers = result.get('workers', 0)
time_taken = result.get('total_time_seconds', 0)
urls_per_sec = urls / time_taken if time_taken > 0 else 0
# Parse timestamp from test_id for sorting
try:
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
data.append({
'test_id': test_id,
'timestamp': timestamp,
'urls': urls,
'workers': workers,
'time_seconds': time_taken,
'urls_per_sec': urls_per_sec
})
except:
console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
if not data:
console.print("[yellow]No valid data for performance chart[/yellow]")
return None
# Convert to DataFrame and sort by timestamp
df = pd.DataFrame(data)
df = df.sort_values('timestamp')
# Create the plot
fig, ax1 = plt.subplots(figsize=(12, 6))
# Plot URLs per second as bars with properly set x-axis
x_pos = range(len(df['test_id']))
bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
ax1.set_ylabel('URLs per Second', color='#88c0d0')
ax1.tick_params(axis='y', labelcolor='#88c0d0')
# Properly set x-axis labels
ax1.set_xticks(x_pos)
ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
# Add worker count as text on each bar
for i, bar in enumerate(bars):
height = bar.get_height()
workers = df.iloc[i]['workers']
ax1.text(i, height + 0.1,
f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
# Add a second y-axis for total URLs
ax2 = ax1.twinx()
ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
ax2.set_ylabel('Total URLs', color='#bf616a')
ax2.tick_params(axis='y', labelcolor='#bf616a')
# Set title and layout
plt.title('Crawl4AI Performance Benchmarks')
plt.tight_layout()
# Save the figure
if output_file is None:
output_file = self.output_dir / "performance_comparison.png"
plt.savefig(output_file, dpi=100, bbox_inches='tight')
plt.close()
return output_file
def generate_memory_charts(self, results, output_prefix=None):
"""Generate memory usage charts for each test.
Args:
results: Dictionary mapping test IDs to result data
output_prefix: Prefix for output file names
Returns:
List of paths to the saved chart files
"""
if not VISUALIZATION_AVAILABLE:
console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
return []
output_files = []
for test_id, result in results.items():
if 'memory_samples' not in result:
continue
memory_df = result['memory_samples']
# Check if we have enough data points
if len(memory_df) < 2:
continue
# Try to extract numeric values from memory_info strings
try:
memory_values = []
for mem_str in memory_df['memory_info']:
# Extract the number from strings like "142.8 MB"
value = float(mem_str.split()[0])
memory_values.append(value)
memory_df['memory_mb'] = memory_values
except Exception as e:
console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
continue
# Create the plot
plt.figure(figsize=(10, 6))
# Plot memory usage over time
plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'],
color='#88c0d0', marker='o', linewidth=2, markersize=4)
# Add annotations for chunk processing
chunk_size = result.get('chunk_size', 0)
url_count = result.get('url_count', 0)
if chunk_size > 0 and url_count > 0:
# Estimate chunk processing times
num_chunks = (url_count + chunk_size - 1) // chunk_size # Ceiling division
total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
for i, time_point in enumerate(chunk_times):
if time_point <= memory_df['elapsed_seconds'].max():
plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}',
rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
# Set labels and title
plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)',
color='#e0e0e0')
# Add grid and set y-axis to start from zero
plt.grid(True, alpha=0.3, color='#4c566a')
# Add test metadata as text
info_text = (
f"URLs: {url_count}\n"
f"Workers: {result.get('workers', 'N/A')}\n"
f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
)
# Calculate memory growth
if len(memory_df) >= 2:
first_mem = memory_df.iloc[0]['memory_mb']
last_mem = memory_df.iloc[-1]['memory_mb']
growth = last_mem - first_mem
growth_rate = growth / result.get('total_time_seconds', 1)
info_text += f"Memory Growth: {growth:.1f} MB\n"
info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
# Save the figure
if output_prefix is None:
output_file = self.output_dir / f"memory_chart_{test_id}.png"
else:
output_file = Path(f"{output_prefix}_memory_{test_id}.png")
plt.tight_layout()
plt.savefig(output_file, dpi=100, bbox_inches='tight')
plt.close()
output_files.append(output_file)
return output_files
def generate_comparison_report(self, results, title=None, output_file=None):
"""Generate a comprehensive comparison report of multiple test runs.
Args:
results: Dictionary mapping test IDs to result data
title: Optional title for the report
output_file: File path to save the report
Returns:
Path to the saved report file
"""
if not results:
console.print("[yellow]No results to generate comparison report[/yellow]")
return None
if output_file is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = self.output_dir / f"comparison_report_{timestamp}.html"
# Create data for the report
rows = []
for test_id, data in results.items():
# Calculate metrics
urls = data.get('url_count', 0)
workers = data.get('workers', 0)
successful = data.get('successful_urls', 0)
failed = data.get('failed_urls', 0)
time_seconds = data.get('total_time_seconds', 0)
# Calculate additional metrics
success_rate = (successful / urls) * 100 if urls > 0 else 0
urls_per_second = urls / time_seconds if time_seconds > 0 else 0
urls_per_worker = urls / workers if workers > 0 else 0
# Calculate memory growth if available
mem_start = None
mem_end = None
mem_growth = None
if 'memory_samples' in data:
samples = data['memory_samples']
if len(samples) >= 2:
try:
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
mem_start = first_mem
mem_end = last_mem
mem_growth = last_mem - first_mem
except:
pass
# Parse timestamp from test_id
try:
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
except:
timestamp = None
rows.append({
'test_id': test_id,
'timestamp': timestamp,
'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
'urls': urls,
'workers': workers,
'chunk_size': data.get('chunk_size', 0),
'successful': successful,
'failed': failed,
'success_rate': success_rate,
'time_seconds': time_seconds,
'urls_per_second': urls_per_second,
'urls_per_worker': urls_per_worker,
'memory_start': mem_start,
'memory_end': mem_end,
'memory_growth': mem_growth
})
# Sort data by timestamp if possible
if VISUALIZATION_AVAILABLE:
# Convert to DataFrame and sort by timestamp
df = pd.DataFrame(rows)
if 'timestamp' in df.columns and not df['timestamp'].isna().all():
df = df.sort_values('timestamp', ascending=False)
else:
# Simple sorting without pandas
rows.sort(key=lambda x: x.get('timestamp', datetime.now()), reverse=True)
df = None
# Generate HTML report
html = []
html.append('<!DOCTYPE html>')
html.append('<html lang="en">')
html.append('<head>')
html.append('<meta charset="UTF-8">')
html.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
html.append(f'<title>{title or "Crawl4AI Benchmark Comparison"}</title>')
html.append('<style>')
html.append('''
body {
font-family: Arial, sans-serif;
line-height: 1.6;
margin: 0;
padding: 20px;
max-width: 1200px;
margin: 0 auto;
color: #e0e0e0;
background-color: #1e1e1e;
}
h1, h2, h3 {
color: #81a1c1;
}
table {
border-collapse: collapse;
width: 100%;
margin-bottom: 20px;
}
th, td {
text-align: left;
padding: 12px;
border-bottom: 1px solid #444;
}
th {
background-color: #2e3440;
font-weight: bold;
}
tr:hover {
background-color: #2e3440;
}
a {
color: #88c0d0;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
.chart-container {
margin: 30px 0;
text-align: center;
background-color: #2e3440;
padding: 20px;
border-radius: 8px;
}
.chart-container img {
max-width: 100%;
height: auto;
border: 1px solid #444;
box-shadow: 0 0 10px rgba(0,0,0,0.3);
}
.card {
border: 1px solid #444;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
background-color: #2e3440;
box-shadow: 0 0 10px rgba(0,0,0,0.2);
}
.highlight {
background-color: #3b4252;
font-weight: bold;
}
.status-good {
color: #a3be8c;
}
.status-warning {
color: #ebcb8b;
}
.status-bad {
color: #bf616a;
}
''')
html.append('</style>')
html.append('</head>')
html.append('<body>')
# Header
html.append(f'<h1>{title or "Crawl4AI Benchmark Comparison"}</h1>')
html.append(f'<p>Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>')
# Summary section
html.append('<div class="card">')
html.append('<h2>Summary</h2>')
html.append('<p>This report compares the performance of Crawl4AI across multiple test runs.</p>')
# Summary metrics
data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0)
if data_available:
# Get the latest test data
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
latest_test = df.iloc[0]
latest_id = latest_test['test_id']
else:
latest_test = rows[0] # First row (already sorted by timestamp)
latest_id = latest_test['test_id']
html.append('<h3>Latest Test Results</h3>')
html.append('<ul>')
html.append(f'<li><strong>Test ID:</strong> {latest_id}</li>')
html.append(f'<li><strong>Date:</strong> {latest_test["date"]}</li>')
html.append(f'<li><strong>URLs:</strong> {latest_test["urls"]}</li>')
html.append(f'<li><strong>Workers:</strong> {latest_test["workers"]}</li>')
html.append(f'<li><strong>Success Rate:</strong> {latest_test["success_rate"]:.1f}%</li>')
html.append(f'<li><strong>Time:</strong> {latest_test["time_seconds"]:.2f} seconds</li>')
html.append(f'<li><strong>Performance:</strong> {latest_test["urls_per_second"]:.1f} URLs/second</li>')
# Check memory growth (handle both pandas and dict mode)
memory_growth_available = False
if VISUALIZATION_AVAILABLE and df is not None:
if pd.notna(latest_test["memory_growth"]):
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
memory_growth_available = True
else:
if latest_test["memory_growth"] is not None:
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
memory_growth_available = True
html.append('</ul>')
# If we have more than one test, show trend
if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1):
if VISUALIZATION_AVAILABLE and df is not None:
prev_test = df.iloc[1]
else:
prev_test = rows[1]
# Calculate performance change
perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0
status_class = ""
if perf_change > 5:
status_class = "status-good"
elif perf_change < -5:
status_class = "status-bad"
html.append('<h3>Performance Trend</h3>')
html.append('<ul>')
html.append(f'<li><strong>Performance Change:</strong> <span class="{status_class}">{perf_change:+.1f}%</span> compared to previous test</li>')
# Memory trend if available
memory_trend_available = False
if VISUALIZATION_AVAILABLE and df is not None:
if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]):
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
memory_trend_available = True
else:
if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None:
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
memory_trend_available = True
if memory_trend_available:
mem_status = ""
if mem_change < -1: # Improved (less growth)
mem_status = "status-good"
elif mem_change > 1: # Worse (more growth)
mem_status = "status-bad"
html.append(f'<li><strong>Memory Trend:</strong> <span class="{mem_status}">{mem_change:+.1f} MB</span> change in memory growth</li>')
html.append('</ul>')
html.append('</div>')
# Generate performance chart if visualization is available
if VISUALIZATION_AVAILABLE:
perf_chart = self.generate_performance_chart(results)
if perf_chart:
html.append('<div class="chart-container">')
html.append('<h2>Performance Comparison</h2>')
html.append(f'<img src="{os.path.relpath(perf_chart, os.path.dirname(output_file))}" alt="Performance Comparison Chart">')
html.append('</div>')
else:
html.append('<div class="chart-container">')
html.append('<h2>Performance Comparison</h2>')
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
html.append('</div>')
# Generate memory charts if visualization is available
if VISUALIZATION_AVAILABLE:
memory_charts = self.generate_memory_charts(results)
if memory_charts:
html.append('<div class="chart-container">')
html.append('<h2>Memory Usage</h2>')
for chart in memory_charts:
test_id = chart.stem.split('_')[-1]
html.append(f'<h3>Test {test_id}</h3>')
html.append(f'<img src="{os.path.relpath(chart, os.path.dirname(output_file))}" alt="Memory Chart for {test_id}">')
html.append('</div>')
else:
html.append('<div class="chart-container">')
html.append('<h2>Memory Usage</h2>')
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
html.append('</div>')
# Detailed results table
html.append('<h2>Detailed Results</h2>')
# Add the results as an HTML table
html.append('<table>')
# Table headers
html.append('<tr>')
for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']:
html.append(f'<th>{col}</th>')
html.append('</tr>')
# Table rows - handle both pandas DataFrame and list of dicts
if VISUALIZATION_AVAILABLE and df is not None:
# Using pandas DataFrame
for _, row in df.iterrows():
html.append('<tr>')
html.append(f'<td>{row["test_id"]}</td>')
html.append(f'<td>{row["date"]}</td>')
html.append(f'<td>{row["urls"]}</td>')
html.append(f'<td>{row["workers"]}</td>')
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
# Memory growth cell
if pd.notna(row["memory_growth"]):
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
else:
html.append('<td>N/A</td>')
html.append('</tr>')
else:
# Using list of dicts (when pandas is not available)
for row in rows:
html.append('<tr>')
html.append(f'<td>{row["test_id"]}</td>')
html.append(f'<td>{row["date"]}</td>')
html.append(f'<td>{row["urls"]}</td>')
html.append(f'<td>{row["workers"]}</td>')
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
# Memory growth cell
if row["memory_growth"] is not None:
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
else:
html.append('<td>N/A</td>')
html.append('</tr>')
html.append('</table>')
# Conclusion section
html.append('<div class="card">')
html.append('<h2>Conclusion</h2>')
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
# Using pandas for statistics (when available)
# Calculate some overall statistics
avg_urls_per_sec = df['urls_per_second'].mean()
max_urls_per_sec = df['urls_per_second'].max()
# Determine if we have a trend
if len(df) > 1:
trend_data = df.sort_values('timestamp')
first_perf = trend_data.iloc[0]['urls_per_second']
last_perf = trend_data.iloc[-1]['urls_per_second']
perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0
if perf_change > 10:
trend_desc = "significantly improved"
trend_class = "status-good"
elif perf_change > 5:
trend_desc = "improved"
trend_class = "status-good"
elif perf_change < -10:
trend_desc = "significantly decreased"
trend_class = "status-bad"
elif perf_change < -5:
trend_desc = "decreased"
trend_class = "status-bad"
else:
trend_desc = "remained stable"
trend_class = ""
html.append(f'<p>Overall performance has <span class="{trend_class}">{trend_desc}</span> over the test period.</p>')
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
# Memory leak assessment
if 'memory_growth' in df.columns and not df['memory_growth'].isna().all():
avg_growth = df['memory_growth'].mean()
max_growth = df['memory_growth'].max()
if avg_growth < 5:
leak_assessment = "No significant memory leaks detected"
leak_class = "status-good"
elif avg_growth < 10:
leak_assessment = "Minor memory growth observed"
leak_class = "status-warning"
else:
leak_assessment = "Potential memory leak detected"
leak_class = "status-bad"
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
else:
# Manual calculations without pandas
if rows:
# Calculate average and max throughput
total_urls_per_sec = sum(row['urls_per_second'] for row in rows)
avg_urls_per_sec = total_urls_per_sec / len(rows)
max_urls_per_sec = max(row['urls_per_second'] for row in rows)
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
# Memory assessment (simplified without pandas)
growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None]
if growth_values:
avg_growth = sum(growth_values) / len(growth_values)
if avg_growth < 5:
leak_assessment = "No significant memory leaks detected"
leak_class = "status-good"
elif avg_growth < 10:
leak_assessment = "Minor memory growth observed"
leak_class = "status-warning"
else:
leak_assessment = "Potential memory leak detected"
leak_class = "status-bad"
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
else:
html.append('<p>No test data available for analysis.</p>')
html.append('</div>')
# Footer
html.append('<div style="margin-top: 30px; text-align: center; color: #777; font-size: 0.9em;">')
html.append('<p>Generated by Crawl4AI Benchmark Reporter</p>')
html.append('</div>')
html.append('</body>')
html.append('</html>')
# Write the HTML file
with open(output_file, 'w') as f:
f.write('\n'.join(html))
# Print a clickable link for terminals that support it (iTerm, VS Code, etc.)
file_url = f"file://{os.path.abspath(output_file)}"
console.print(f"[green]Comparison report saved to: {output_file}[/green]")
console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]")
return output_file
def run(self, limit=None, output_file=None):
"""Generate a full benchmark report.
Args:
limit: Optional limit on number of most recent tests to include
output_file: Optional output file path
Returns:
Path to the generated report file
"""
# Load test results
results = self.load_test_results(limit=limit)
if not results:
console.print("[yellow]No test results found. Run some tests first.[/yellow]")
return None
# Generate and display summary table
summary_table = self.generate_summary_table(results)
console.print(summary_table)
# Generate comparison report
title = f"Crawl4AI Benchmark Report ({len(results)} test runs)"
report_file = self.generate_comparison_report(results, title=title, output_file=output_file)
if report_file:
console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]")
return report_file
else:
console.print("[bold red]Failed to generate report[/bold red]")
return None
def main():
"""Main entry point for the benchmark reporter."""
parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests")
parser.add_argument("--reports-dir", type=str, default="reports",
help="Directory containing test result files")
parser.add_argument("--output-dir", type=str, default="benchmark_reports",
help="Directory to save generated reports")
parser.add_argument("--limit", type=int, default=None,
help="Limit to most recent N test results")
parser.add_argument("--output-file", type=str, default=None,
help="Custom output file path for the report")
args = parser.parse_args()
# Create the benchmark reporter
reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir)
# Generate the report
report_file = reporter.run(limit=args.limit, output_file=args.output_file)
if report_file:
print(f"Report generated at: {report_file}")
return 0
else:
print("Failed to generate report")
return 1
if __name__ == "__main__":
import sys
sys.exit(main())
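For reference, the reporter can also be driven directly from Python instead of via the CLI; a minimal sketch mirroring main() above (directory names are the defaults, and it assumes the module is importable as benchmark_report):
# Programmatic use of the reporter (sketch only)
from benchmark_report import BenchmarkReporter
reporter = BenchmarkReporter(reports_dir="reports", output_dir="benchmark_reports")
report_path = reporter.run(limit=10)  # include only the 10 most recent test runs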

34
tests/memory/cap_test.py Normal file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
"""
import asyncio, httpx, json, uuid, argparse
API = "http://localhost:8020/crawl"
URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
CONCURRENT_CALLS = 20 # way above your cap
payload_template = {
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "verbose": False},
}
}
async def one_call(client):
payload = payload_template.copy()
payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
r = await client.post(API, json=payload)
r.raise_for_status()
return r.json()["server_peak_memory_mb"]
async def main():
async with httpx.AsyncClient(timeout=60) as client:
tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
mem_usages = await asyncio.gather(*tasks)
print("Calls finished OK, server peaks reported:", mem_usages)
if __name__ == "__main__":
asyncio.run(main())
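Note that argparse and URLS_PER_CALL are imported/defined above but never used. A minimal sketch of how the call count and endpoint could be made configurable; the --calls and --api flag names are hypothetical, not part of the script:
# Hypothetical CLI wrapper (sketch only; flag names are assumptions)
def parse_args():
    parser = argparse.ArgumentParser(description="GLOBAL_SEM cap smoke test")
    parser.add_argument("--calls", type=int, default=CONCURRENT_CALLS,
                        help="number of concurrent /crawl requests to fire")
    parser.add_argument("--api", default=API, help="crawl endpoint URL")
    return parser.parse_args()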

View File

@@ -0,0 +1,4 @@
pandas>=1.5.0
matplotlib>=3.5.0
seaborn>=0.12.0
rich>=12.0.0

259
tests/memory/run_benchmark.py Executable file
View File

@@ -0,0 +1,259 @@
#!/usr/bin/env python3
"""
Run a complete Crawl4AI benchmark test using test_stress_sdk.py and generate a report.
"""
import sys
import os
import glob
import argparse
import subprocess
import time
import pathlib  # needed for pathlib.Path(...).as_uri() when linking reports below
from datetime import datetime
from rich.console import Console
from rich.text import Text
console = Console()
# Updated TEST_CONFIGS to use max_sessions
TEST_CONFIGS = {
"quick": {"urls": 50, "max_sessions": 4, "chunk_size": 10, "description": "Quick test (50 URLs, 4 sessions)"},
"small": {"urls": 100, "max_sessions": 8, "chunk_size": 20, "description": "Small test (100 URLs, 8 sessions)"},
"medium": {"urls": 500, "max_sessions": 16, "chunk_size": 50, "description": "Medium test (500 URLs, 16 sessions)"},
"large": {"urls": 1000, "max_sessions": 32, "chunk_size": 100,"description": "Large test (1000 URLs, 32 sessions)"},
"extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200,"description": "Extreme test (2000 URLs, 64 sessions)"},
}
# Arguments to forward directly if present in custom_args
FORWARD_ARGS = {
"urls": "--urls",
"max_sessions": "--max-sessions",
"chunk_size": "--chunk-size",
"port": "--port",
"monitor_mode": "--monitor-mode",
}
# Boolean flags to forward if True
FORWARD_FLAGS = {
"stream": "--stream",
"use_rate_limiter": "--use-rate-limiter",
"keep_server_alive": "--keep-server-alive",
"use_existing_site": "--use-existing-site",
"skip_generation": "--skip-generation",
"keep_site": "--keep-site",
"clean_reports": "--clean-reports", # Note: clean behavior is handled here, but pass flag if needed
"clean_site": "--clean-site", # Note: clean behavior is handled here, but pass flag if needed
}
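As a quick illustration of how these mapping tables are used by run_benchmark() below, the "quick" preset expands to roughly the following command (assuming test_stress_sdk.py accepts the flags named in FORWARD_ARGS):
# Illustrative only -- shows how a preset becomes a command line
def _example_quick_command():
    cfg = TEST_CONFIGS["quick"]
    cmd = ["python", "test_stress_sdk.py"]
    for key, flag in FORWARD_ARGS.items():
        if key in cfg:
            cmd.extend([flag, str(cfg[key])])
    return cmd  # ['python', 'test_stress_sdk.py', '--urls', '50', '--max-sessions', '4', '--chunk-size', '10']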
def run_benchmark(config_name, custom_args=None, compare=True, clean=False):
"""Runs the stress test and optionally the report generator."""
if config_name not in TEST_CONFIGS and config_name != "custom":
console.print(f"[bold red]Unknown configuration: {config_name}[/bold red]")
return False
# Print header
title = "Crawl4AI SDK Benchmark Test"
if config_name != "custom":
title += f" - {TEST_CONFIGS[config_name]['description']}"
else:
# Safely get custom args for title
urls = custom_args.get('urls', '?') if custom_args else '?'
sessions = custom_args.get('max_sessions', '?') if custom_args else '?'
title += f" - Custom ({urls} URLs, {sessions} sessions)"
console.print(f"\n[bold blue]{title}[/bold blue]")
console.print("=" * (len(title) + 4)) # Adjust underline length
console.print("\n[bold white]Preparing test...[/bold white]")
# --- Command Construction ---
# Use the new script name
cmd = ["python", "test_stress_sdk.py"]
# Apply config or custom args
args_to_use = {}
if config_name != "custom":
args_to_use = TEST_CONFIGS[config_name].copy()
# If custom args are provided (e.g., boolean flags), overlay them
if custom_args:
args_to_use.update(custom_args)
elif custom_args: # Custom config
args_to_use = custom_args.copy()
# Add arguments with values
for key, arg_name in FORWARD_ARGS.items():
if key in args_to_use:
cmd.extend([arg_name, str(args_to_use[key])])
# Add boolean flags
for key, flag_name in FORWARD_FLAGS.items():
if args_to_use.get(key, False): # Check if key exists and is True
# Special handling for clean flags - apply locally, don't forward?
# Decide if test_stress_sdk.py also needs --clean flags or if run_benchmark handles it.
# For now, let's assume run_benchmark handles cleaning based on its own --clean flag.
# We'll forward other flags.
if key not in ["clean_reports", "clean_site"]:
cmd.append(flag_name)
# Handle the top-level --clean flag for run_benchmark
if clean:
# Pass clean flags to the stress test script as well, if needed
# This assumes test_stress_sdk.py also uses --clean-reports and --clean-site
cmd.append("--clean-reports")
cmd.append("--clean-site")
console.print("[yellow]Applying --clean: Cleaning reports and site before test.[/yellow]")
# Actual cleaning logic might reside here or be delegated entirely
console.print(f"\n[bold white]Running stress test:[/bold white] {' '.join(cmd)}")
start = time.time()
# Execute the stress test script
# Use Popen to stream output
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8', errors='replace')
while True:
line = proc.stdout.readline()
if not line:
break
console.print(line.rstrip()) # Print line by line
proc.wait() # Wait for the process to complete
except FileNotFoundError:
console.print(f"[bold red]Error: Script 'test_stress_sdk.py' not found. Make sure it's in the correct directory.[/bold red]")
return False
except Exception as e:
console.print(f"[bold red]Error running stress test subprocess: {e}[/bold red]")
return False
if proc.returncode != 0:
console.print(f"[bold red]Stress test failed with exit code {proc.returncode}[/bold red]")
return False
duration = time.time() - start
console.print(f"[bold green]Stress test completed in {duration:.1f} seconds[/bold green]")
# --- Report Generation (Optional) ---
if compare:
# Assuming benchmark_report.py exists and works with the generated reports
report_script = "benchmark_report.py" # Keep configurable if needed
report_cmd = ["python", report_script]
console.print(f"\n[bold white]Generating benchmark report: {' '.join(report_cmd)}[/bold white]")
# Run the report command and capture output
try:
report_proc = subprocess.run(report_cmd, capture_output=True, text=True, check=False, encoding='utf-8', errors='replace') # Use check=False to handle potential errors
# Print the captured output from benchmark_report.py
if report_proc.stdout:
console.print("\n" + report_proc.stdout)
if report_proc.stderr:
console.print("[yellow]Report generator stderr:[/yellow]\n" + report_proc.stderr)
if report_proc.returncode != 0:
console.print(f"[bold yellow]Benchmark report generation script '{report_script}' failed with exit code {report_proc.returncode}[/bold yellow]")
# Don't return False here, test itself succeeded
else:
console.print(f"[bold green]Benchmark report script '{report_script}' completed.[/bold green]")
# Find and print clickable links to the reports
# Assuming reports are saved in 'benchmark_reports' by benchmark_report.py
report_dir = "benchmark_reports"
if os.path.isdir(report_dir):
report_files = glob.glob(os.path.join(report_dir, "comparison_report_*.html"))
if report_files:
try:
latest_report = max(report_files, key=os.path.getctime)
report_path = os.path.abspath(latest_report)
report_url = pathlib.Path(report_path).as_uri() # Better way to create file URI
console.print(f"[bold cyan]Click to open report: [link={report_url}]{report_url}[/link][/bold cyan]")
except Exception as e:
console.print(f"[yellow]Could not determine latest report: {e}[/yellow]")
chart_files = glob.glob(os.path.join(report_dir, "memory_chart_*.png"))
if chart_files:
try:
latest_chart = max(chart_files, key=os.path.getctime)
chart_path = os.path.abspath(latest_chart)
chart_url = pathlib.Path(chart_path).as_uri()
console.print(f"[cyan]Memory chart: [link={chart_url}]{chart_url}[/link][/cyan]")
except Exception as e:
console.print(f"[yellow]Could not determine latest chart: {e}[/yellow]")
else:
console.print(f"[yellow]Benchmark report directory '{report_dir}' not found. Cannot link reports.[/yellow]")
except FileNotFoundError:
console.print(f"[bold red]Error: Report script '{report_script}' not found.[/bold red]")
except Exception as e:
console.print(f"[bold red]Error running report generation subprocess: {e}[/bold red]")
# Prompt to exit
console.print("\n[bold green]Benchmark run finished. Press Enter to exit.[/bold green]")
try:
input() # Wait for user input
except EOFError:
pass # Handle case where input is piped or unavailable
return True
def main():
parser = argparse.ArgumentParser(description="Run a Crawl4AI SDK benchmark test and generate a report")
# --- Arguments ---
parser.add_argument("config", choices=list(TEST_CONFIGS) + ["custom"],
help="Test configuration: quick, small, medium, large, extreme, or custom")
# Arguments for 'custom' config or to override presets
parser.add_argument("--urls", type=int, help="Number of URLs")
parser.add_argument("--max-sessions", type=int, help="Max concurrent sessions (replaces --workers)")
parser.add_argument("--chunk-size", type=int, help="URLs per batch (for non-stream logging)")
parser.add_argument("--port", type=int, help="HTTP server port")
parser.add_argument("--monitor-mode", type=str, choices=["DETAILED", "AGGREGATED"], help="Monitor display mode")
# Boolean flags / options
parser.add_argument("--stream", action="store_true", help="Enable streaming results (disables batch logging)")
parser.add_argument("--use-rate-limiter", action="store_true", help="Enable basic rate limiter")
parser.add_argument("--no-report", action="store_true", help="Skip generating comparison report")
parser.add_argument("--clean", action="store_true", help="Clean up reports and site before running")
parser.add_argument("--keep-server-alive", action="store_true", help="Keep HTTP server running after test")
parser.add_argument("--use-existing-site", action="store_true", help="Use existing site on specified port")
parser.add_argument("--skip-generation", action="store_true", help="Use existing site files without regenerating")
parser.add_argument("--keep-site", action="store_true", help="Keep generated site files after test")
# Removed url_level_logging as it's implicitly handled by stream/batch mode now
args = parser.parse_args()
custom_args = {}
# Populate custom_args from explicit command-line args
if args.urls is not None: custom_args["urls"] = args.urls
if args.max_sessions is not None: custom_args["max_sessions"] = args.max_sessions
if args.chunk_size is not None: custom_args["chunk_size"] = args.chunk_size
if args.port is not None: custom_args["port"] = args.port
if args.monitor_mode is not None: custom_args["monitor_mode"] = args.monitor_mode
if args.stream: custom_args["stream"] = True
if args.use_rate_limiter: custom_args["use_rate_limiter"] = True
if args.keep_server_alive: custom_args["keep_server_alive"] = True
if args.use_existing_site: custom_args["use_existing_site"] = True
if args.skip_generation: custom_args["skip_generation"] = True
if args.keep_site: custom_args["keep_site"] = True
# Clean flags are handled by the 'clean' argument passed to run_benchmark
# Validate custom config requirements
if args.config == "custom":
required_custom = ["urls", "max_sessions", "chunk_size"]
missing = [f"--{arg}" for arg in required_custom if arg not in custom_args]
if missing:
console.print(f"[bold red]Error: 'custom' config requires: {', '.join(missing)}[/bold red]")
return 1
success = run_benchmark(
config_name=args.config,
custom_args=custom_args, # Pass all collected custom args
compare=not args.no_report,
clean=args.clean
)
return 0 if success else 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,168 @@
"""
Test script for the CrawlerMonitor component.
This script simulates a crawler with multiple tasks to demonstrate the real-time monitoring capabilities.
"""
import time
import uuid
import random
import threading
import sys
import os
# Add the parent directory to the path to import crawl4ai
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
from crawl4ai.components.crawler_monitor import CrawlerMonitor
from crawl4ai.models import CrawlStatus
def simulate_crawler_task(monitor, task_id, url, simulate_failure=False):
"""Simulate a crawler task with different states."""
# Task starts in the QUEUED state
wait_time = random.uniform(0.5, 3.0)
time.sleep(wait_time)
# Update to IN_PROGRESS state
monitor.update_task(
task_id=task_id,
status=CrawlStatus.IN_PROGRESS,
start_time=time.time(),
wait_time=wait_time
)
# Simulate task running
process_time = random.uniform(1.0, 5.0)
for i in range(int(process_time * 2)):
# Simulate memory usage changes
memory_usage = random.uniform(5.0, 25.0)
monitor.update_task(
task_id=task_id,
memory_usage=memory_usage,
peak_memory=max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
)
time.sleep(0.5)
# Update to COMPLETED or FAILED state
if simulate_failure and random.random() < 0.8: # 80% chance of failure if simulate_failure is True
monitor.update_task(
task_id=task_id,
status=CrawlStatus.FAILED,
end_time=time.time(),
error_message="Simulated failure: Connection timeout",
memory_usage=0.0
)
else:
monitor.update_task(
task_id=task_id,
status=CrawlStatus.COMPLETED,
end_time=time.time(),
memory_usage=0.0
)
def update_queue_stats(monitor, num_queued_tasks):
"""Update queue statistics periodically."""
while monitor.is_running:
queued_tasks = [
task for task_id, task in monitor.get_all_task_stats().items()
if task["status"] == CrawlStatus.QUEUED.name
]
total_queued = len(queued_tasks)
if total_queued > 0:
current_time = time.time()
wait_times = [
current_time - task.get("enqueue_time", current_time)
for task in queued_tasks
]
highest_wait_time = max(wait_times) if wait_times else 0.0
avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
else:
highest_wait_time = 0.0
avg_wait_time = 0.0
monitor.update_queue_statistics(
total_queued=total_queued,
highest_wait_time=highest_wait_time,
avg_wait_time=avg_wait_time
)
# Simulate memory pressure based on number of active tasks
active_tasks = len([
task for task_id, task in monitor.get_all_task_stats().items()
if task["status"] == CrawlStatus.IN_PROGRESS.name
])
if active_tasks > 8:
monitor.update_memory_status("CRITICAL")
elif active_tasks > 4:
monitor.update_memory_status("PRESSURE")
else:
monitor.update_memory_status("NORMAL")
time.sleep(1.0)
def test_crawler_monitor():
"""Test the CrawlerMonitor with simulated crawler tasks."""
# Total number of URLs to crawl
total_urls = 50
# Initialize the monitor
monitor = CrawlerMonitor(urls_total=total_urls, refresh_rate=0.5)
# Start the monitor
monitor.start()
# Start thread to update queue statistics
queue_stats_thread = threading.Thread(target=update_queue_stats, args=(monitor, total_urls))
queue_stats_thread.daemon = True
queue_stats_thread.start()
try:
# Create task threads
threads = []
for i in range(total_urls):
task_id = str(uuid.uuid4())
url = f"https://example.com/page{i}"
# Add task to monitor
monitor.add_task(task_id, url)
# Determine if this task should simulate failure
simulate_failure = (i % 10 == 0) # Every 10th task
# Create and start thread for this task
thread = threading.Thread(
target=simulate_crawler_task,
args=(monitor, task_id, url, simulate_failure)
)
thread.daemon = True
threads.append(thread)
# Start threads with delay to simulate tasks being added over time
batch_size = 5
for i in range(0, len(threads), batch_size):
batch = threads[i:i+batch_size]
for thread in batch:
thread.start()
time.sleep(0.5) # Small delay between starting threads
# Wait a bit before starting the next batch
time.sleep(2.0)
# Wait for all threads to complete
for thread in threads:
thread.join()
# Keep monitor running a bit longer to see the final state
time.sleep(5.0)
except KeyboardInterrupt:
print("\nTest interrupted by user")
finally:
# Stop the monitor
monitor.stop()
print("\nCrawler monitor test completed")
if __name__ == "__main__":
test_crawler_monitor()

View File

@@ -0,0 +1,410 @@
import asyncio
import time
import psutil
import logging
import random
from typing import List, Dict
import uuid
import sys
import os
# Import your crawler components
from crawl4ai.models import DisplayMode, CrawlStatus, CrawlResult
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig, CacheMode
from crawl4ai import AsyncWebCrawler
from crawl4ai import MemoryAdaptiveDispatcher, CrawlerMonitor
# Global configuration
STREAM = False # Toggle between streaming and non-streaming modes
# Configure logging to file only (to avoid breaking the rich display)
os.makedirs("logs", exist_ok=True)
file_handler = logging.FileHandler("logs/memory_stress_test.log")
file_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
# Root logger - only to file, not console
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(file_handler)
# Our test logger also writes to file only
logger = logging.getLogger("memory_stress_test")
logger.setLevel(logging.INFO)
logger.addHandler(file_handler)
logger.propagate = False # Don't propagate to root logger
# Create a memory restrictor to simulate limited memory environment
class MemorySimulator:
def __init__(self, target_percent: float = 85.0, aggressive: bool = False):
"""Simulates memory pressure by allocating memory"""
self.target_percent = target_percent
self.memory_blocks: List[bytearray] = []
self.aggressive = aggressive
def apply_pressure(self, additional_percent: float = 0.0):
"""Fill memory until we reach target percentage"""
current_percent = psutil.virtual_memory().percent
target = self.target_percent + additional_percent
if current_percent >= target:
return # Already at target
logger.info(f"Current memory: {current_percent}%, target: {target}%")
# Calculate how much memory we need to allocate
total_memory = psutil.virtual_memory().total
target_usage = (target / 100.0) * total_memory
current_usage = (current_percent / 100.0) * total_memory
bytes_to_allocate = int(target_usage - current_usage)
if bytes_to_allocate <= 0:
return
# Allocate in smaller chunks to avoid overallocation
if self.aggressive:
# Use larger chunks for faster allocation in aggressive mode
chunk_size = min(bytes_to_allocate, 200 * 1024 * 1024) # 200MB chunks
else:
chunk_size = min(bytes_to_allocate, 50 * 1024 * 1024) # 50MB chunks
try:
logger.info(f"Allocating {chunk_size / (1024 * 1024):.1f}MB to reach target memory usage")
self.memory_blocks.append(bytearray(chunk_size))
time.sleep(0.5) # Give system time to register the allocation
except MemoryError:
logger.warning("Unable to allocate more memory")
def release_pressure(self, percent: float = None):
"""
Release allocated memory
If percent is specified, release that percentage of blocks
"""
if not self.memory_blocks:
return
if percent is None:
# Release all
logger.info(f"Releasing all {len(self.memory_blocks)} memory blocks")
self.memory_blocks.clear()
else:
# Release specified percentage
blocks_to_release = int(len(self.memory_blocks) * (percent / 100.0))
if blocks_to_release > 0:
logger.info(f"Releasing {blocks_to_release} of {len(self.memory_blocks)} memory blocks ({percent}%)")
self.memory_blocks = self.memory_blocks[blocks_to_release:]
def spike_pressure(self, duration: float = 5.0):
"""
Create a temporary spike in memory pressure then release
Useful for forcing requeues
"""
logger.info(f"Creating memory pressure spike for {duration} seconds")
# Save current blocks count
initial_blocks = len(self.memory_blocks)
# Create spike with extra 5%
self.apply_pressure(additional_percent=5.0)
# Schedule release after duration
asyncio.create_task(self._delayed_release(duration, initial_blocks))
async def _delayed_release(self, delay: float, target_blocks: int):
"""Helper for spike_pressure - releases extra blocks after delay"""
await asyncio.sleep(delay)
# Remove blocks added since spike started
if len(self.memory_blocks) > target_blocks:
logger.info(f"Releasing memory spike ({len(self.memory_blocks) - target_blocks} blocks)")
self.memory_blocks = self.memory_blocks[:target_blocks]
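A minimal usage sketch of MemorySimulator; note that spike_pressure() assumes a running asyncio event loop because it schedules the delayed release with asyncio.create_task (the numbers below are illustrative):
# Usage sketch (illustrative values, not part of the test flow)
async def _memory_simulator_demo():
    sim = MemorySimulator(target_percent=80.0)
    sim.apply_pressure()              # allocate until ~80% of system memory is in use
    sim.spike_pressure(duration=5.0)  # temporary +5% spike, auto-released after 5 s
    await asyncio.sleep(6)
    sim.release_pressure(percent=50)  # free half of the allocated blocks
    sim.release_pressure()            # free everything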
# Test statistics collector
class TestResults:
def __init__(self):
self.start_time = time.time()
self.completed_urls: List[str] = []
self.failed_urls: List[str] = []
self.requeued_count = 0
self.memory_warnings = 0
self.max_memory_usage = 0.0
self.max_queue_size = 0
self.max_wait_time = 0.0
self.url_to_attempt: Dict[str, int] = {} # Track retries per URL
def log_summary(self):
duration = time.time() - self.start_time
logger.info("===== TEST SUMMARY =====")
logger.info(f"Stream mode: {'ON' if STREAM else 'OFF'}")
logger.info(f"Total duration: {duration:.1f} seconds")
logger.info(f"Completed URLs: {len(self.completed_urls)}")
logger.info(f"Failed URLs: {len(self.failed_urls)}")
logger.info(f"Requeue events: {self.requeued_count}")
logger.info(f"Memory warnings: {self.memory_warnings}")
logger.info(f"Max memory usage: {self.max_memory_usage:.1f}%")
logger.info(f"Max queue size: {self.max_queue_size}")
logger.info(f"Max wait time: {self.max_wait_time:.1f} seconds")
# Log URLs with multiple attempts
retried_urls = {url: count for url, count in self.url_to_attempt.items() if count > 1}
if retried_urls:
logger.info(f"URLs with retries: {len(retried_urls)}")
# Log the top 5 most retried
top_retries = sorted(retried_urls.items(), key=lambda x: x[1], reverse=True)[:5]
for url, count in top_retries:
logger.info(f" URL {url[-30:]} had {count} attempts")
# Write summary to a separate human-readable file
with open("logs/test_summary.txt", "w") as f:
f.write(f"Stream mode: {'ON' if STREAM else 'OFF'}\n")
f.write(f"Total duration: {duration:.1f} seconds\n")
f.write(f"Completed URLs: {len(self.completed_urls)}\n")
f.write(f"Failed URLs: {len(self.failed_urls)}\n")
f.write(f"Requeue events: {self.requeued_count}\n")
f.write(f"Memory warnings: {self.memory_warnings}\n")
f.write(f"Max memory usage: {self.max_memory_usage:.1f}%\n")
f.write(f"Max queue size: {self.max_queue_size}\n")
f.write(f"Max wait time: {self.max_wait_time:.1f} seconds\n")
# Custom monitor with stats tracking
# Custom monitor that extends CrawlerMonitor with test-specific tracking
class StressTestMonitor(CrawlerMonitor):
def __init__(self, test_results: TestResults, **kwargs):
# Initialize the parent CrawlerMonitor
super().__init__(**kwargs)
self.test_results = test_results
def update_memory_status(self, status: str):
if status != self.memory_status:
logger.info(f"Memory status changed: {self.memory_status} -> {status}")
if "CRITICAL" in status or "PRESSURE" in status:
self.test_results.memory_warnings += 1
# Track peak memory usage in test results
current_memory = psutil.virtual_memory().percent
self.test_results.max_memory_usage = max(self.test_results.max_memory_usage, current_memory)
# Call parent method to update the dashboard
super().update_memory_status(status)
def update_queue_statistics(self, total_queued: int, highest_wait_time: float, avg_wait_time: float):
# Track queue metrics in test results
self.test_results.max_queue_size = max(self.test_results.max_queue_size, total_queued)
self.test_results.max_wait_time = max(self.test_results.max_wait_time, highest_wait_time)
# Call parent method to update the dashboard
super().update_queue_statistics(total_queued, highest_wait_time, avg_wait_time)
def update_task(self, task_id: str, **kwargs):
# Track URL status changes for test results
if task_id in self.stats:
old_status = self.stats[task_id].status
# If this is a requeue event (requeued due to memory pressure)
if 'error_message' in kwargs and 'requeued' in kwargs['error_message']:
if not hasattr(self.stats[task_id], 'counted_requeue') or not self.stats[task_id].counted_requeue:
self.test_results.requeued_count += 1
self.stats[task_id].counted_requeue = True
# Track completion status for test results
if 'status' in kwargs:
new_status = kwargs['status']
if old_status != new_status:
if new_status == CrawlStatus.COMPLETED:
if task_id not in self.test_results.completed_urls:
self.test_results.completed_urls.append(task_id)
elif new_status == CrawlStatus.FAILED:
if task_id not in self.test_results.failed_urls:
self.test_results.failed_urls.append(task_id)
# Call parent method to update the dashboard
super().update_task(task_id, **kwargs)
self.live.update(self._create_table())
# Generate test URLs - use example.com with unique paths to avoid browser caching
def generate_test_urls(count: int) -> List[str]:
urls = []
for i in range(count):
# Add random path and query parameters to create unique URLs
path = f"/path/{uuid.uuid4()}"
query = f"?test={i}&random={random.randint(1, 100000)}"
urls.append(f"https://example.com{path}{query}")
return urls
# Process result callback
async def process_result(result, test_results: TestResults):
# Track attempt counts
if result.url not in test_results.url_to_attempt:
test_results.url_to_attempt[result.url] = 1
else:
test_results.url_to_attempt[result.url] += 1
if "requeued" in result.error_message:
test_results.requeued_count += 1
logger.debug(f"Requeued due to memory pressure: {result.url}")
elif result.success:
test_results.completed_urls.append(result.url)
logger.debug(f"Successfully processed: {result.url}")
else:
test_results.failed_urls.append(result.url)
logger.warning(f"Failed to process: {result.url} - {result.error_message}")
# Process multiple results (used in non-streaming mode)
async def process_results(results, test_results: TestResults):
for result in results:
await process_result(result, test_results)
# Main test function for extreme memory pressure simulation
async def run_memory_stress_test(
url_count: int = 100,
target_memory_percent: float = 92.0, # Push to dangerous levels
chunk_size: int = 20, # Larger chunks for more chaos
aggressive: bool = False,
spikes: bool = True
):
test_results = TestResults()
memory_simulator = MemorySimulator(target_percent=target_memory_percent, aggressive=aggressive)
logger.info(f"Starting stress test with {url_count} URLs in {'STREAM' if STREAM else 'NON-STREAM'} mode")
logger.info(f"Target memory usage: {target_memory_percent}%")
# First, elevate memory usage to create pressure
logger.info("Creating initial memory pressure...")
memory_simulator.apply_pressure()
# Create test URLs in chunks to simulate real-world crawling where URLs are discovered
all_urls = generate_test_urls(url_count)
url_chunks = [all_urls[i:i+chunk_size] for i in range(0, len(all_urls), chunk_size)]
# Set up the crawler components - low memory thresholds to create more requeues
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
verbose=False,
stream=STREAM # Use the global STREAM variable to set mode
)
# Create monitor with reference to test results
monitor = StressTestMonitor(
test_results=test_results,
display_mode=DisplayMode.DETAILED,
max_visible_rows=20,
total_urls=url_count # Pass total URLs count
)
# Create dispatcher with EXTREME settings - pure survival mode
# These settings are designed to create a memory battleground
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=63.0, # Start throttling at just 63% memory
critical_threshold_percent=70.0, # Start requeuing at 70% - incredibly aggressive
recovery_threshold_percent=55.0, # Only resume normal ops when plenty of memory available
check_interval=0.1, # Check extremely frequently (100ms)
max_session_permit=20 if aggressive else 10, # Double the concurrent sessions - pure chaos
fairness_timeout=10.0, # Extremely low timeout - rapid priority changes
monitor=monitor
)
# Set up spike schedule if enabled
if spikes:
spike_intervals = []
# Create 3-5 random spike times
num_spikes = random.randint(3, 5)
for _ in range(num_spikes):
# Schedule spikes at random chunks
chunk_index = random.randint(1, len(url_chunks) - 1)
spike_intervals.append(chunk_index)
logger.info(f"Scheduled memory spikes at chunks: {spike_intervals}")
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
# Process URLs in chunks to simulate discovering URLs over time
for chunk_index, url_chunk in enumerate(url_chunks):
logger.info(f"Processing chunk {chunk_index+1}/{len(url_chunks)} ({len(url_chunk)} URLs)")
# Regular pressure increases
if chunk_index % 2 == 0:
logger.info("Increasing memory pressure...")
memory_simulator.apply_pressure()
# Memory spike if scheduled for this chunk
if spikes and chunk_index in spike_intervals:
logger.info(f"⚠️ CREATING MASSIVE MEMORY SPIKE at chunk {chunk_index+1} ⚠️")
# Create a nightmare scenario - multiple overlapping spikes
memory_simulator.spike_pressure(duration=10.0) # 10-second spike
# 50% chance of double-spike (pure evil)
if random.random() < 0.5:
await asyncio.sleep(2.0) # Wait 2 seconds
logger.info("💀 DOUBLE SPIKE - EXTREME MEMORY PRESSURE 💀")
memory_simulator.spike_pressure(duration=8.0) # 8-second overlapping spike
if STREAM:
# Stream mode - process results as they come in
async for result in dispatcher.run_urls_stream(
urls=url_chunk,
crawler=crawler,
config=run_config
):
await process_result(result, test_results)
else:
# Non-stream mode - get all results at once
results = await dispatcher.run_urls(
urls=url_chunk,
crawler=crawler,
config=run_config
)
await process_results(results, test_results)
# Simulate discovering more URLs while others are still processing
await asyncio.sleep(1)
# RARELY release pressure - make the system fight for resources
if chunk_index % 5 == 4: # Less frequent releases
release_percent = random.choice([10, 15, 20]) # Smaller, inconsistent releases
logger.info(f"Releasing {release_percent}% of memory blocks - brief respite")
memory_simulator.release_pressure(percent=release_percent)
except Exception as e:
logger.error(f"Test error: {str(e)}")
raise
finally:
# Release memory pressure
memory_simulator.release_pressure()
# Log final results
test_results.log_summary()
# Check for success criteria
if len(test_results.completed_urls) + len(test_results.failed_urls) < url_count:
logger.error(f"TEST FAILED: Not all URLs were processed. {url_count - len(test_results.completed_urls) - len(test_results.failed_urls)} URLs missing.")
return False
logger.info("TEST PASSED: All URLs were processed without crashing.")
return True
# Command-line entry point
if __name__ == "__main__":
# Parse command line arguments
url_count = int(sys.argv[1]) if len(sys.argv) > 1 else 100
target_memory = float(sys.argv[2]) if len(sys.argv) > 2 else 85.0
# Check if stream mode is specified
if len(sys.argv) > 3:
STREAM = sys.argv[3].lower() in ('true', 'yes', '1', 'stream')
# Check if aggressive mode is specified
aggressive = False
if len(sys.argv) > 4:
aggressive = sys.argv[4].lower() in ('true', 'yes', '1', 'aggressive')
print(f"Starting test with {url_count} URLs, {target_memory}% memory target")
print(f"Stream mode: {STREAM}, Aggressive: {aggressive}")
print("Logs will be written to the logs directory")
print("Live display starting now...")
# Run the test
result = asyncio.run(run_memory_stress_test(
url_count=url_count,
target_memory_percent=target_memory,
aggressive=aggressive
))
# Exit with status code
sys.exit(0 if result else 1)
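For reference, the positional arguments parsed above map to url_count, target memory percent, stream mode, and aggressive mode, in that order. Assuming the script is saved as test_memory_stress.py (the actual filename is not shown in this diff), typical invocations look like:
# Example invocations (derived from the sys.argv handling above; the filename is an assumption)
#   python test_memory_stress.py                      # 100 URLs, 85% memory target, non-stream
#   python test_memory_stress.py 200 90               # 200 URLs, 90% memory target
#   python test_memory_stress.py 200 90 stream        # same, streaming results as they finish
#   python test_memory_stress.py 200 90 1 aggressive  # stream mode plus aggressive allocation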

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
"""
Quick sanity-check for the /config/dump endpoint.
Usage:
python test_config_dump.py [http://localhost:8020]
If the server isn't running, start it first:
uvicorn deploy.docker.server:app --port 8020
"""
import sys, json, textwrap, requests
# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
URL = f"{BASE.rstrip('/')}/config/dump"
CASES = [
# --- CrawlRunConfig variants ---
"CrawlerRunConfig()",
"CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
"CrawlerRunConfig(js_only=True, wait_until='networkidle')",
# --- BrowserConfig variants ---
"BrowserConfig()",
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
]
for code in CASES:
print("\n=== POST:", code)
resp = requests.post(URL, json={"code": code}, timeout=15)
if resp.ok:
print(json.dumps(resp.json(), indent=2)[:400] + "...")
else:
print("ERROR", resp.status_code, resp.text[:200])

View File

@@ -0,0 +1,520 @@
#!/usr/bin/env python3
"""
Stress test for Crawl4AI's Docker API server (/crawl and /crawl/stream endpoints).
This version targets a running Crawl4AI API server, sending concurrent requests
to test its ability to handle multiple crawl jobs simultaneously.
It uses httpx for async HTTP requests and logs results per batch of requests,
including server-side memory usage reported by the API.
"""
import asyncio
import time
import uuid
import argparse
import json
import sys
import os
import shutil
from typing import List, Dict, Optional, Union, AsyncGenerator, Tuple
import httpx
import pathlib # Import pathlib explicitly
from rich.console import Console
from rich.panel import Panel
from rich.syntax import Syntax
# --- Constants ---
DEFAULT_API_URL = "http://localhost:11235" # Default port
DEFAULT_API_URL = "http://localhost:8020" # Default port
DEFAULT_URL_COUNT = 100
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
DEFAULT_CHUNK_SIZE = 10
DEFAULT_REPORT_PATH = "reports_api"
DEFAULT_STREAM_MODE = True
REQUEST_TIMEOUT = 180.0
# Initialize Rich console
console = Console()
# --- API Health Check (Unchanged) ---
async def check_server_health(client: httpx.AsyncClient, health_endpoint: str = "/health"):
"""Check if the API server is healthy."""
console.print(f"[bold cyan]Checking API server health at {client.base_url}{health_endpoint}...[/]", end="")
try:
response = await client.get(health_endpoint, timeout=10.0)
response.raise_for_status()
health_data = response.json()
version = health_data.get('version', 'N/A')
console.print(f"[bold green] Server OK! Version: {version}[/]")
return True
except (httpx.RequestError, httpx.HTTPStatusError) as e:
console.print(f"\n[bold red]Server health check FAILED:[/]")
console.print(f"Error: {e}")
console.print(f"Is the server running and accessible at {client.base_url}?")
return False
except Exception as e:
console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
console.print(e)
return False
# --- API Stress Test Class ---
class ApiStressTest:
"""Orchestrates the stress test by sending concurrent requests to the API."""
def __init__(
self,
api_url: str,
url_count: int,
max_concurrent_requests: int,
chunk_size: int,
report_path: str,
stream_mode: bool,
):
self.api_base_url = api_url.rstrip('/')
self.url_count = url_count
self.max_concurrent_requests = max_concurrent_requests
self.chunk_size = chunk_size
self.report_path = pathlib.Path(report_path)
self.report_path.mkdir(parents=True, exist_ok=True)
self.stream_mode = stream_mode
# Ignore repo path and set it to current file path
self.repo_path = pathlib.Path(__file__).parent.resolve()
self.test_id = time.strftime("%Y%m%d_%H%M%S")
self.results_summary = {
"test_id": self.test_id, "api_url": api_url, "url_count": url_count,
"max_concurrent_requests": max_concurrent_requests, "chunk_size": chunk_size,
"stream_mode": stream_mode, "start_time": "", "end_time": "",
"total_time_seconds": 0, "successful_requests": 0, "failed_requests": 0,
"successful_urls": 0, "failed_urls": 0, "total_urls_processed": 0,
"total_api_calls": 0,
"server_memory_metrics": { # To store aggregated server memory info
"batch_mode_avg_delta_mb": None,
"batch_mode_max_delta_mb": None,
"stream_mode_avg_max_snapshot_mb": None,
"stream_mode_max_max_snapshot_mb": None,
"samples": [] # Store individual request memory results
}
}
self.http_client = httpx.AsyncClient(base_url=self.api_base_url, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=max_concurrent_requests + 5, max_keepalive_connections=max_concurrent_requests))
async def close_client(self):
"""Close the httpx client."""
await self.http_client.aclose()
async def run(self) -> Dict:
"""Run the API stress test."""
# No client memory tracker needed
urls_to_process = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(self.url_count)]
url_chunks = [urls_to_process[i:i+self.chunk_size] for i in range(0, len(urls_to_process), self.chunk_size)]
self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
start_time = time.time()
console.print(f"\n[bold cyan]Crawl4AI API Stress Test - {self.url_count} URLs, {self.max_concurrent_requests} concurrent requests[/bold cyan]")
console.print(f"[bold cyan]Target API:[/bold cyan] {self.api_base_url}, [bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]URLs per Request:[/bold cyan] {self.chunk_size}")
# Removed client memory log
semaphore = asyncio.Semaphore(self.max_concurrent_requests)
# Updated Batch logging header
console.print("\n[bold]API Request Batch Progress:[/bold]")
# Adjusted spacing and added Peak
console.print("[bold] Batch | Progress | SrvMem Peak / Δ|Max (MB) | Reqs/sec | S/F URLs | Time (s) | Status [/bold]")
# Adjust separator length if needed, looks okay for now
console.print("" * 95)
# No client memory monitor task needed
tasks = []
total_api_calls = len(url_chunks)
self.results_summary["total_api_calls"] = total_api_calls
try:
for i, chunk in enumerate(url_chunks):
task = asyncio.create_task(self._make_api_request(
chunk=chunk,
batch_idx=i + 1,
total_batches=total_api_calls,
semaphore=semaphore
# No memory tracker passed
))
tasks.append(task)
api_results = await asyncio.gather(*tasks)
# Process aggregated results including server memory
total_successful_requests = sum(1 for r in api_results if r['request_success'])
total_failed_requests = total_api_calls - total_successful_requests
total_successful_urls = sum(r['success_urls'] for r in api_results)
total_failed_urls = sum(r['failed_urls'] for r in api_results)
total_urls_processed = total_successful_urls + total_failed_urls
# Aggregate server memory metrics
valid_samples = [r for r in api_results if r.get('server_delta_or_max_mb') is not None] # Filter results with valid mem data
self.results_summary["server_memory_metrics"]["samples"] = valid_samples # Store raw samples with both peak and delta/max
if valid_samples:
delta_or_max_values = [r['server_delta_or_max_mb'] for r in valid_samples]
if self.stream_mode:
# Stream mode: delta_or_max holds max snapshot
self.results_summary["server_memory_metrics"]["stream_mode_avg_max_snapshot_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
self.results_summary["server_memory_metrics"]["stream_mode_max_max_snapshot_mb"] = max(delta_or_max_values)
else: # Batch mode
# delta_or_max holds delta
self.results_summary["server_memory_metrics"]["batch_mode_avg_delta_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
self.results_summary["server_memory_metrics"]["batch_mode_max_delta_mb"] = max(delta_or_max_values)
# Aggregate peak values for batch mode
peak_values = [r['server_peak_memory_mb'] for r in valid_samples if r.get('server_peak_memory_mb') is not None]
if peak_values:
self.results_summary["server_memory_metrics"]["batch_mode_avg_peak_mb"] = sum(peak_values) / len(peak_values)
self.results_summary["server_memory_metrics"]["batch_mode_max_peak_mb"] = max(peak_values)
self.results_summary.update({
"successful_requests": total_successful_requests,
"failed_requests": total_failed_requests,
"successful_urls": total_successful_urls,
"failed_urls": total_failed_urls,
"total_urls_processed": total_urls_processed,
})
except Exception as e:
console.print(f"[bold red]An error occurred during task execution: {e}[/bold red]")
import traceback
traceback.print_exc()
# No finally block needed for monitor task
end_time = time.time()
self.results_summary.update({
"end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"total_time_seconds": end_time - start_time,
# No client memory report
})
self._save_results()
return self.results_summary
async def _make_api_request(
self,
chunk: List[str],
batch_idx: int,
total_batches: int,
semaphore: asyncio.Semaphore
# No memory tracker
) -> Dict:
"""Makes a single API request for a chunk of URLs, handling concurrency and logging server memory."""
request_success = False
success_urls = 0
failed_urls = 0
status = "Pending"
status_color = "grey"
server_memory_metric = None # Store delta (batch) or max snapshot (stream)
api_call_start_time = time.time()
async with semaphore:
try:
# No client memory sampling
endpoint = "/crawl/stream" if self.stream_mode else "/crawl"
payload = {
"urls": chunk,
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "stream": self.stream_mode}
}
}
if self.stream_mode:
max_server_mem_snapshot = 0.0 # Track max memory seen in this stream
async with self.http_client.stream("POST", endpoint, json=payload) as response:
initial_status_code = response.status_code
response.raise_for_status()
completed_marker_received = False
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
if data.get("status") == "completed":
completed_marker_received = True
break
elif data.get("url"):
if data.get("success"): success_urls += 1
else: failed_urls += 1
# Extract server memory snapshot per result
mem_snapshot = data.get('server_memory_mb')
if mem_snapshot is not None:
max_server_mem_snapshot = max(max_server_mem_snapshot, float(mem_snapshot))
except json.JSONDecodeError:
console.print(f"[Batch {batch_idx}] [red]Stream decode error for line:[/red] {line}")
failed_urls = len(chunk)
break
request_success = completed_marker_received
if not request_success:
failed_urls = len(chunk) - success_urls
server_memory_metric = max_server_mem_snapshot # Use max snapshot for stream logging
else: # Batch mode
response = await self.http_client.post(endpoint, json=payload)
response.raise_for_status()
data = response.json()
# Extract server memory delta from the response
server_memory_metric = data.get('server_memory_delta_mb')
server_peak_mem_mb = data.get('server_peak_memory_mb')
if data.get("success") and "results" in data:
request_success = True
results_list = data.get("results", [])
for result_item in results_list:
if result_item.get("success"): success_urls += 1
else: failed_urls += 1
if len(results_list) != len(chunk):
console.print(f"[Batch {batch_idx}] [yellow]Warning: Result count ({len(results_list)}) doesn't match URL count ({len(chunk)})[/yellow]")
failed_urls = len(chunk) - success_urls
else:
request_success = False
failed_urls = len(chunk)
# Try to get memory from error detail if available
detail = data.get('detail')
if isinstance(detail, str):
try: detail_json = json.loads(detail)
except Exception: detail_json = {}
elif isinstance(detail, dict):
detail_json = detail
else: detail_json = {}
server_peak_mem_mb = detail_json.get('server_peak_memory_mb', None)
server_memory_metric = detail_json.get('server_memory_delta_mb', None)
console.print(f"[Batch {batch_idx}] [red]API request failed:[/red] {detail_json.get('error', 'No details')}")
except httpx.HTTPStatusError as e:
request_success = False
failed_urls = len(chunk)
console.print(f"[Batch {batch_idx}] [bold red]HTTP Error {e.response.status_code}:[/] {e.request.url}")
try:
error_detail = e.response.json()
# Attempt to extract memory info even from error responses
detail_content = error_detail.get('detail', {})
if isinstance(detail_content, str): # Handle if detail is stringified JSON
try: detail_content = json.loads(detail_content)
except Exception: detail_content = {}
server_memory_metric = detail_content.get('server_memory_delta_mb', None)
server_peak_mem_mb = detail_content.get('server_peak_memory_mb', None)
console.print(f"Response: {error_detail}")
except Exception:
console.print(f"Response Text: {e.response.text[:200]}...")
except httpx.RequestError as e:
request_success = False
failed_urls = len(chunk)
console.print(f"[Batch {batch_idx}] [bold red]Request Error:[/bold] {e.request.url} - {e}")
except Exception as e:
request_success = False
failed_urls = len(chunk)
console.print(f"[Batch {batch_idx}] [bold red]Unexpected Error:[/bold] {e}")
import traceback
traceback.print_exc()
finally:
api_call_time = time.time() - api_call_start_time
total_processed_urls = success_urls + failed_urls
if request_success and failed_urls == 0: status_color, status = "green", "Success"
elif request_success and success_urls > 0: status_color, status = "yellow", "Partial"
else: status_color, status = "red", "Failed"
current_total_urls = batch_idx * self.chunk_size
progress_pct = min(100.0, (current_total_urls / self.url_count) * 100)
reqs_per_sec = 1.0 / api_call_time if api_call_time > 0 else float('inf')
# --- New Memory Formatting ---
mem_display = " N/A " # Default
peak_mem_value = None
delta_or_max_value = None
if self.stream_mode:
# server_memory_metric holds max snapshot for stream
if server_memory_metric is not None:
mem_display = f"{server_memory_metric:.1f} (Max)"
delta_or_max_value = server_memory_metric # Store for aggregation
else: # Batch mode - expect peak and delta
# We need to get peak and delta from the API response
peak_mem_value = locals().get('server_peak_mem_mb', None) # Get from response data if available
delta_value = server_memory_metric # server_memory_metric holds delta for batch
if peak_mem_value is not None and delta_value is not None:
mem_display = f"{peak_mem_value:.1f} / {delta_value:+.1f}"
delta_or_max_value = delta_value # Store delta for aggregation
elif peak_mem_value is not None:
mem_display = f"{peak_mem_value:.1f} / N/A"
elif delta_value is not None:
mem_display = f"N/A / {delta_value:+.1f}"
delta_or_max_value = delta_value # Store delta for aggregation
# --- Updated Print Statement with Adjusted Padding ---
console.print(
f" {batch_idx:<5} | {progress_pct:6.1f}% | {mem_display:>24} | {reqs_per_sec:8.1f} | " # Increased width for memory column
f"{success_urls:^7}/{failed_urls:<6} | {api_call_time:8.2f} | [{status_color}]{status:<7}[/{status_color}] " # Added trailing space
)
# --- Updated Return Dictionary ---
return_data = {
"batch_idx": batch_idx,
"request_success": request_success,
"success_urls": success_urls,
"failed_urls": failed_urls,
"time": api_call_time,
# Return both peak (if available) and delta/max
"server_peak_memory_mb": peak_mem_value, # Will be None for stream mode
"server_delta_or_max_mb": delta_or_max_value # Delta for batch, Max for stream
}
# Add back the specific batch mode delta if needed elsewhere, but delta_or_max covers it
# if not self.stream_mode:
# return_data["server_memory_delta_mb"] = delta_value
return return_data
# No _periodic_memory_sample needed
def _save_results(self) -> None:
"""Saves the results summary to a JSON file."""
results_path = self.report_path / f"api_test_summary_{self.test_id}.json"
try:
# No client memory path to convert
with open(results_path, 'w', encoding='utf-8') as f:
json.dump(self.results_summary, f, indent=2, default=str)
except Exception as e:
console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
# --- run_full_test Function ---
async def run_full_test(args):
"""Runs the full API stress test process."""
client = httpx.AsyncClient(base_url=args.api_url, timeout=REQUEST_TIMEOUT)
if not await check_server_health(client):
console.print("[bold red]Aborting test due to server health check failure.[/]")
await client.aclose()
return
await client.aclose()
test = ApiStressTest(
api_url=args.api_url,
url_count=args.urls,
max_concurrent_requests=args.max_concurrent_requests,
chunk_size=args.chunk_size,
report_path=args.report_path,
stream_mode=args.stream,
)
results = {}
try:
results = await test.run()
finally:
await test.close_client()
if not results:
console.print("[bold red]Test did not produce results.[/bold red]")
return
console.print("\n" + "=" * 80)
console.print("[bold green]API Stress Test Completed[/bold green]")
console.print("=" * 80)
success_rate_reqs = results["successful_requests"] / results["total_api_calls"] * 100 if results["total_api_calls"] > 0 else 0
success_rate_urls = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
urls_per_second = results["total_urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
reqs_per_second = results["total_api_calls"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
console.print(f"[bold cyan]Target API:[/bold cyan] {results['api_url']}")
console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_concurrent_requests']} concurrent client requests, URLs/Req: {results['chunk_size']}, Stream: {results['stream_mode']}")
console.print(f"[bold cyan]API Requests:[/bold cyan] {results['successful_requests']} successful, {results['failed_requests']} failed ({results['total_api_calls']} total, {success_rate_reqs:.1f}% success)")
console.print(f"[bold cyan]URL Processing:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['total_urls_processed']} processed, {success_rate_urls:.1f}% success)")
console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f}s total | Avg Reqs/sec: {reqs_per_second:.2f} | Avg URLs/sec: {urls_per_second:.2f}")
# Report Server Memory
mem_metrics = results.get("server_memory_metrics", {})
mem_samples = mem_metrics.get("samples", [])
if mem_samples:
num_samples = len(mem_samples)
if results['stream_mode']:
avg_mem = mem_metrics.get("stream_mode_avg_max_snapshot_mb")
max_mem = mem_metrics.get("stream_mode_max_max_snapshot_mb")
avg_str = f"{avg_mem:.1f}" if avg_mem is not None else "N/A"
max_str = f"{max_mem:.1f}" if max_mem is not None else "N/A"
console.print(f"[bold cyan]Server Memory (Stream):[/bold cyan] Avg Max Snapshot: {avg_str} MB | Max Max Snapshot: {max_str} MB (across {num_samples} requests)")
else: # Batch mode
avg_delta = mem_metrics.get("batch_mode_avg_delta_mb")
max_delta = mem_metrics.get("batch_mode_max_delta_mb")
avg_peak = mem_metrics.get("batch_mode_avg_peak_mb")
max_peak = mem_metrics.get("batch_mode_max_peak_mb")
avg_delta_str = f"{avg_delta:.1f}" if avg_delta is not None else "N/A"
max_delta_str = f"{max_delta:.1f}" if max_delta is not None else "N/A"
avg_peak_str = f"{avg_peak:.1f}" if avg_peak is not None else "N/A"
max_peak_str = f"{max_peak:.1f}" if max_peak is not None else "N/A"
console.print(f"[bold cyan]Server Memory (Batch):[/bold cyan] Avg Peak: {avg_peak_str} MB | Max Peak: {max_peak_str} MB | Avg Delta: {avg_delta_str} MB | Max Delta: {max_delta_str} MB (across {num_samples} requests)")
else:
console.print("[bold cyan]Server Memory:[/bold cyan] No memory data reported by server.")
# No client memory report
summary_path = pathlib.Path(args.report_path) / f"api_test_summary_{results['test_id']}.json"
console.print(f"[bold green]Results summary saved to {summary_path}[/bold green]")
if results["failed_requests"] > 0:
console.print(f"\n[bold yellow]Warning: {results['failed_requests']} API requests failed ({100-success_rate_reqs:.1f}% failure rate)[/bold yellow]")
if results["failed_urls"] > 0:
console.print(f"[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate_urls:.1f}% URL failure rate)[/bold yellow]")
if results["total_urls_processed"] < results["url_count"]:
console.print(f"\n[bold red]Error: Only {results['total_urls_processed']} out of {results['url_count']} target URLs were processed![/bold red]")
# --- main Function (Argument parsing mostly unchanged) ---
def main():
"""Main entry point for the script."""
parser = argparse.ArgumentParser(description="Crawl4AI API Server Stress Test")
parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL, help=f"Base URL of the Crawl4AI API server (default: {DEFAULT_API_URL})")
parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Total number of unique URLs to process via API calls (default: {DEFAULT_URL_COUNT})")
parser.add_argument("--max-concurrent-requests", type=int, default=DEFAULT_MAX_CONCURRENT_REQUESTS, help=f"Maximum concurrent API requests from this client (default: {DEFAULT_MAX_CONCURRENT_REQUESTS})")
parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per API request payload (default: {DEFAULT_CHUNK_SIZE})")
parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Use the /crawl/stream endpoint instead of /crawl (default: {DEFAULT_STREAM_MODE})")
parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
args = parser.parse_args()
console.print("[bold underline]Crawl4AI API Stress Test Configuration[/bold underline]")
console.print(f"API URL: {args.api_url}")
console.print(f"Total URLs: {args.urls}, Concurrent Client Requests: {args.max_concurrent_requests}, URLs per Request: {args.chunk_size}")
console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}")
console.print(f"Report Path: {args.report_path}")
console.print("-" * 40)
if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
console.print("-" * 40)
if args.clean_reports:
report_dir = pathlib.Path(args.report_path)
if report_dir.exists():
console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]")
shutil.rmtree(args.report_path)
report_dir.mkdir(parents=True, exist_ok=True)
try:
asyncio.run(run_full_test(args))
except KeyboardInterrupt:
console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
except Exception as e:
console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
# No need to modify sys.path for SimpleMemoryTracker as it's removed
main()

View File

@@ -0,0 +1,203 @@
"""Lite Crawl4AI API stresstester.
✔ batch or stream mode (single unified path)
✔ global stats + JSON summary
✔ rich table progress
✔ Typer CLI with presets (quick / soak)
Usage examples:
python api_stress_test.py # uses quick preset
python api_stress_test.py soak # 5K URLs stress run
python api_stress_test.py --urls 200 --concurrent 10 --chunk 20
"""
from __future__ import annotations
import asyncio, json, time, uuid, pathlib, statistics
from typing import List, Dict, Optional
import httpx, typer
from rich.console import Console
from rich.table import Table
# ───────────────────────── defaults / presets ──────────────────────────
PRESETS = {
"quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
"debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
"soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
}
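# Example: `python api_stress_test.py soak` expands (via the Typer CLI below) to
# --urls 5000 --concurrent 20 --chunk 50 --stream; explicitly passed options override the preset values.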
API_HEALTH_ENDPOINT = "/health"
REQUEST_TIMEOUT = 180.0
console = Console()
app = typer.Typer(add_completion=False, rich_markup_mode="rich")
# ───────────────────────── helpers ─────────────────────────────────────
async def _check_health(client: httpx.AsyncClient) -> None:
resp = await client.get(API_HEALTH_ENDPOINT, timeout=10)
resp.raise_for_status()
console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]")
async def _iter_results(resp: httpx.Response, stream: bool):
"""Yield result dicts from batch JSON or NDJSON stream."""
if stream:
async for line in resp.aiter_lines():
if not line:
continue
rec = json.loads(line)
if rec.get("status") == "completed":
break
yield rec
else:
data = resp.json()
for rec in data.get("results", []):
yield rec, data # rec + whole payload for memory delta/peak
async def _consume_stream(resp: httpx.Response) -> Dict:
stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0}
async for line in resp.aiter_lines():
if not line:
continue
rec = json.loads(line)
if rec.get("status") == "completed":
break
if rec.get("success"):
stats["success_urls"] += 1
else:
stats["failed_urls"] += 1
mem = rec.get("server_memory_mb")
if mem is not None:
stats["mem_metric"] = max(stats["mem_metric"], float(mem))
return stats
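# Illustrative NDJSON records the stream consumer above expects; the values are made up and
# only the keys it actually reads ("success", "server_memory_mb", "status") are assumed:
#   {"url": "https://httpbin.org/anything/abc", "success": true, "server_memory_mb": 412.3}
#   {"url": "https://httpbin.org/anything/def", "success": false}
#   {"status": "completed"}   <- terminates the stream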
def _consume_batch(body: Dict) -> Dict:
stats = {"success_urls": 0, "failed_urls": 0}
for rec in body.get("results", []):
if rec.get("success"):
stats["success_urls"] += 1
else:
stats["failed_urls"] += 1
stats["mem_metric"] = body.get("server_memory_delta_mb")
stats["peak"] = body.get("server_peak_memory_mb")
return stats
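# For reference, a batch response body shaped like what _consume_batch reads; the numbers are
# hypothetical and only the keys accessed above are assumed:
#   {"results": [{"success": true}, {"success": false}],
#    "server_memory_delta_mb": 35.2, "server_peak_memory_mb": 910.4}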
async def _fetch_chunk(
client: httpx.AsyncClient,
urls: List[str],
stream: bool,
semaphore: asyncio.Semaphore,
) -> Dict:
endpoint = "/crawl/stream" if stream else "/crawl"
payload = {
"urls": urls,
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "stream": stream}},
}
async with semaphore:
start = time.perf_counter()
if stream:
# ---- streaming request ----
async with client.stream("POST", endpoint, json=payload) as resp:
resp.raise_for_status()
stats = await _consume_stream(resp)
else:
# ---- batch request ----
resp = await client.post(endpoint, json=payload)
resp.raise_for_status()
stats = _consume_batch(resp.json())
stats["elapsed"] = time.perf_counter() - start
return stats
# ───────────────────────── core runner ─────────────────────────────────
async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
await _check_health(client)
url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
sem = asyncio.Semaphore(concurrent)
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Batch", style="dim", width=6)
table.add_column("Success/Fail", width=12)
table.add_column("Mem", width=14)
table.add_column("Time (s)")
agg_success = agg_fail = 0
deltas, peaks = [], []
start = time.perf_counter()
tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
res = await coro
agg_success += res["success_urls"]
agg_fail += res["failed_urls"]
if res["mem_metric"] is not None:
deltas.append(res["mem_metric"])
if res["peak"] is not None:
peaks.append(res["peak"])
mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else ""
if res["peak"] is not None:
mem_txt = f"{res['peak']:.1f}/{mem_txt}"
table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
console.print(table)
total_time = time.perf_counter() - start
summary = {
"urls": urls,
"concurrent": concurrent,
"chunk": chunk,
"stream": stream,
"success_urls": agg_success,
"failed_urls": agg_fail,
"elapsed_sec": round(total_time, 2),
"avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
"max_mem": max(deltas) if deltas else None,
"avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
"max_peak": max(peaks) if peaks else None,
}
console.print("\n[bold green]Done:[/]" , summary)
report.mkdir(parents=True, exist_ok=True)
path = report / f"api_test_{int(time.time())}.json"
path.write_text(json.dumps(summary, indent=2))
console.print(f"[green]Summary → {path}")
await client.aclose()
# ───────────────────────── Typer CLI ──────────────────────────────────
@app.command()
def main(
preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
api_url: str = typer.Option("http://localhost:8020", show_default=True),
urls: int = typer.Option(None, help="Total URLs to crawl"),
concurrent: int = typer.Option(None, help="Concurrent API requests"),
chunk: int = typer.Option(None, help="URLs per request"),
stream: bool = typer.Option(None, help="Use /crawl/stream"),
report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
):
"""Run a stress test against a running Crawl4AI API server."""
if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)):
console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
raise typer.Exit(1)
cfg = PRESETS.get(preset, {})
urls = urls or cfg.get("urls")
concurrent = concurrent or cfg.get("concurrent")
chunk = chunk or cfg.get("chunk")
stream = stream if stream is not None else cfg.get("stream", False)
console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
if __name__ == "__main__":
app()

View File

@@ -0,0 +1,129 @@
"""
Crawl4AI Docker API stress tester.
Examples
--------
python test_stress_docker_api.py --urls 1000 --concurrency 32
python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream
python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2
"""
import argparse, asyncio, json, secrets, statistics, time
from typing import List, Tuple
import httpx
from rich.console import Console
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
from rich.table import Table
console = Console()
# ───────────────────────── helpers ─────────────────────────
def make_fake_urls(n: int) -> List[str]:
base = "https://httpbin.org/anything/"
return [f"{base}{secrets.token_hex(8)}" for _ in range(n)]
async def fire(
client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore
) -> Tuple[bool, float]:
async with sem:
print(f"POST {endpoint} with {len(payload['urls'])} URLs")
t0 = time.perf_counter()
try:
if endpoint.endswith("/stream"):
async with client.stream("POST", endpoint, json=payload) as r:
r.raise_for_status()
async for _ in r.aiter_lines():
pass
else:
r = await client.post(endpoint, json=payload)
r.raise_for_status()
return True, time.perf_counter() - t0
except Exception:
return False, time.perf_counter() - t0
def pct(lat: List[float], p: float) -> str:
"""Return percentile string even for tiny samples."""
if not lat:
return "-"
if len(lat) == 1:
return f"{lat[0]:.2f}s"
lat_sorted = sorted(lat)
k = (p / 100) * (len(lat_sorted) - 1)
lo = int(k)
hi = min(lo + 1, len(lat_sorted) - 1)
frac = k - lo
val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac
return f"{val:.2f}s"
# ───────────────────────── main ─────────────────────────
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API")
p.add_argument("--urls", type=int, default=100, help="number of URLs")
p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight")
p.add_argument("--chunk-size", type=int, default=50, help="URLs per request")
p.add_argument("--base-url", default="http://localhost:11235", help="API root")
# p.add_argument("--base-url", default="http://localhost:8020", help="API root")
p.add_argument("--stream", action="store_true", help="use /crawl/stream")
p.add_argument("--http2", action="store_true", help="enable HTTP/2")
p.add_argument("--headless", action="store_true", default=True)
return p.parse_args()
async def main() -> None:
args = parse_args()
urls = make_fake_urls(args.urls)
batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)]
endpoint = "/crawl/stream" if args.stream else "/crawl"
sem = asyncio.Semaphore(args.concurrency)
async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client:
with Progress(
"[progress.description]{task.description}",
BarColumn(),
"[progress.percentage]{task.percentage:>3.0f}%",
TimeElapsedColumn(),
TimeRemainingColumn(),
) as progress:
task_id = progress.add_task("[cyan]bombarding…", total=len(batches))
tasks = []
for chunk in batches:
payload = {
"urls": chunk,
"browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}},
}
tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem)))
results = []
# Advance the progress bar as each request finishes rather than when it is scheduled
for coro in asyncio.as_completed(tasks):
results.append(await coro)
progress.advance(task_id)
ok_latencies = [dt for ok, dt in results if ok]
err_count = sum(1 for ok, _ in results if not ok)
table = Table(title="Docker API StressTest Summary")
table.add_column("total", justify="right")
table.add_column("errors", justify="right")
table.add_column("p50", justify="right")
table.add_column("p95", justify="right")
table.add_column("max", justify="right")
table.add_row(
str(len(results)),
str(err_count),
pct(ok_latencies, 50),
pct(ok_latencies, 95),
f"{max(ok_latencies):.2f}s" if ok_latencies else "-",
)
console.print(table)
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
console.print("\n[yellow]aborted by user[/]")

View File

@@ -0,0 +1,500 @@
#!/usr/bin/env python3
"""
Stress test for Crawl4AI's arun_many and dispatcher system.
This version uses a local HTTP server and focuses on testing
the SDK's ability to handle multiple URLs concurrently, with per-batch logging.
"""
import asyncio
import os
import time
import pathlib
import random
import secrets
import argparse
import json
import sys
import subprocess
import signal
from typing import List, Dict, Optional, Union, AsyncGenerator
import shutil
from rich.console import Console
# Crawl4AI components
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
BrowserConfig,
MemoryAdaptiveDispatcher,
CrawlerMonitor,
DisplayMode,
CrawlResult,
RateLimiter,
CacheMode,
)
# Constants
DEFAULT_SITE_PATH = "test_site"
DEFAULT_PORT = 8000
DEFAULT_MAX_SESSIONS = 16
DEFAULT_URL_COUNT = 1
DEFAULT_CHUNK_SIZE = 1 # Define chunk size for batch logging
DEFAULT_REPORT_PATH = "reports"
DEFAULT_STREAM_MODE = False
DEFAULT_MONITOR_MODE = "DETAILED"
# Initialize Rich console
console = Console()
# --- SiteGenerator Class (Unchanged) ---
class SiteGenerator:
"""Generates a local test site with heavy pages for stress testing."""
def __init__(self, site_path: str = DEFAULT_SITE_PATH, page_count: int = DEFAULT_URL_COUNT):
self.site_path = pathlib.Path(site_path)
self.page_count = page_count
self.images_dir = self.site_path / "images"
self.lorem_words = " ".join("lorem ipsum dolor sit amet " * 100).split()
self.html_template = """<!doctype html>
<html>
<head>
<title>Test Page {page_num}</title>
<meta charset="utf-8">
</head>
<body>
<h1>Test Page {page_num}</h1>
{paragraphs}
{images}
</body>
</html>
"""
def generate_site(self) -> None:
self.site_path.mkdir(parents=True, exist_ok=True)
self.images_dir.mkdir(exist_ok=True)
console.print(f"Generating {self.page_count} test pages...")
for i in range(self.page_count):
paragraphs = "\n".join(f"<p>{' '.join(random.choices(self.lorem_words, k=200))}</p>" for _ in range(5))
images = "\n".join(f'<img src="https://picsum.photos/seed/{secrets.token_hex(8)}/300/200" loading="lazy" alt="Random image {j}"/>' for j in range(3))
page_path = self.site_path / f"page_{i}.html"
page_path.write_text(self.html_template.format(page_num=i, paragraphs=paragraphs, images=images), encoding="utf-8")
if (i + 1) % (self.page_count // 10 or 1) == 0 or i == self.page_count - 1:
console.print(f"Generated {i+1}/{self.page_count} pages")
self._create_index_page()
console.print(f"[bold green]Successfully generated {self.page_count} test pages in [cyan]{self.site_path}[/cyan][/bold green]")
def _create_index_page(self) -> None:
index_content = """<!doctype html><html><head><title>Test Site Index</title><meta charset="utf-8"></head><body><h1>Test Site Index</h1><p>This is an automatically generated site for testing Crawl4AI.</p><div class="page-links">\n"""
for i in range(self.page_count):
index_content += f' <a href="page_{i}.html">Test Page {i}</a><br>\n'
index_content += """ </div></body></html>"""
(self.site_path / "index.html").write_text(index_content, encoding="utf-8")
# --- LocalHttpServer Class (Unchanged) ---
class LocalHttpServer:
"""Manages a local HTTP server for serving test pages."""
def __init__(self, site_path: str = DEFAULT_SITE_PATH, port: int = DEFAULT_PORT):
self.site_path = pathlib.Path(site_path)
self.port = port
self.process = None
def start(self) -> None:
if not self.site_path.exists(): raise FileNotFoundError(f"Site directory {self.site_path} does not exist")
console.print(f"Attempting to start HTTP server in [cyan]{self.site_path}[/cyan] on port {self.port}...")
try:
cmd = ["python", "-m", "http.server", str(self.port)]
creationflags = 0; preexec_fn = None
if sys.platform == 'win32': creationflags = subprocess.CREATE_NEW_PROCESS_GROUP
self.process = subprocess.Popen(cmd, cwd=str(self.site_path), stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=creationflags)
time.sleep(1.5)
if self.is_running(): console.print(f"[bold green]HTTP server started successfully (PID: {self.process.pid})[/bold green]")
else:
console.print("[bold red]Failed to start HTTP server. Checking logs...[/bold red]")
stdout, stderr = self.process.communicate(); print(stdout.decode(errors='ignore')); print(stderr.decode(errors='ignore'))
self.stop(); raise RuntimeError("HTTP server failed to start.")
except Exception as e: console.print(f"[bold red]Error starting HTTP server: {str(e)}[/bold red]"); self.stop(); raise
def stop(self) -> None:
if self.process and self.is_running():
console.print(f"Stopping HTTP server (PID: {self.process.pid})...")
try:
if sys.platform == 'win32': self.process.send_signal(signal.CTRL_BREAK_EVENT); time.sleep(0.5)
self.process.terminate()
try: stdout, stderr = self.process.communicate(timeout=5); console.print("[bold yellow]HTTP server stopped[/bold yellow]")
except subprocess.TimeoutExpired: console.print("[bold red]Server did not terminate gracefully, killing...[/bold red]"); self.process.kill(); stdout, stderr = self.process.communicate(); console.print("[bold yellow]HTTP server killed[/bold yellow]")
except Exception as e: console.print(f"[bold red]Error stopping HTTP server: {str(e)}[/bold red]"); self.process.kill()
finally: self.process = None
elif self.process: console.print("[dim]HTTP server process already stopped.[/dim]"); self.process = None
def is_running(self) -> bool:
if not self.process: return False
return self.process.poll() is None
# --- SimpleMemoryTracker Class (Unchanged) ---
class SimpleMemoryTracker:
"""Basic memory tracker that doesn't rely on psutil."""
def __init__(self, report_path: str = DEFAULT_REPORT_PATH, test_id: Optional[str] = None):
self.report_path = pathlib.Path(report_path); self.report_path.mkdir(parents=True, exist_ok=True)
self.test_id = test_id or time.strftime("%Y%m%d_%H%M%S")
self.start_time = time.time(); self.memory_samples = []; self.pid = os.getpid()
self.csv_path = self.report_path / f"memory_samples_{self.test_id}.csv"
with open(self.csv_path, 'w', encoding='utf-8') as f: f.write("timestamp,elapsed_seconds,memory_info_mb\n")
def sample(self) -> Dict:
try:
memory_mb = self._get_memory_info_mb()
memory_str = f"{memory_mb:.1f} MB" if memory_mb is not None else "Unknown"
timestamp = time.time(); elapsed = timestamp - self.start_time
sample = {"timestamp": timestamp, "elapsed_seconds": elapsed, "memory_mb": memory_mb, "memory_str": memory_str}
self.memory_samples.append(sample)
with open(self.csv_path, 'a', encoding='utf-8') as f: f.write(f"{timestamp},{elapsed:.2f},{memory_mb if memory_mb is not None else ''}\n")
return sample
except Exception as e: return {"memory_mb": None, "memory_str": "Error"}
def _get_memory_info_mb(self) -> Optional[float]:
pid_str = str(self.pid)
try:
if sys.platform == 'darwin': result = subprocess.run(["ps", "-o", "rss=", "-p", pid_str], capture_output=True, text=True, check=True, encoding='utf-8'); return int(result.stdout.strip()) / 1024.0
elif sys.platform == 'linux':
with open(f"/proc/{pid_str}/status", encoding='utf-8') as f:
for line in f:
if line.startswith("VmRSS:"): return int(line.split()[1]) / 1024.0
return None
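# tasklist /fo csv /nh prints e.g. "python.exe","1234","Console","1","55,123 K"; the 5th field is the working-set size in KB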
elif sys.platform == 'win32': result = subprocess.run(["tasklist", "/fi", f"PID eq {pid_str}", "/fo", "csv", "/nh"], capture_output=True, text=True, check=True, encoding='cp850', errors='ignore'); parts = result.stdout.strip().split('","'); return int(parts[4].strip().replace('"', '').replace(' K', '').replace(',', '')) / 1024.0 if len(parts) >= 5 else None
else: return None
except Exception: return None # Catch any failure so memory sampling never aborts the test
def get_report(self) -> Dict:
if not self.memory_samples: return {"error": "No memory samples collected"}
total_time = time.time() - self.start_time; valid_samples = [s['memory_mb'] for s in self.memory_samples if s['memory_mb'] is not None]
start_mem = valid_samples[0] if valid_samples else None; end_mem = valid_samples[-1] if valid_samples else None
max_mem = max(valid_samples) if valid_samples else None; avg_mem = sum(valid_samples) / len(valid_samples) if valid_samples else None
growth = (end_mem - start_mem) if start_mem is not None and end_mem is not None else None
return {"test_id": self.test_id, "total_time_seconds": total_time, "sample_count": len(self.memory_samples), "valid_sample_count": len(valid_samples), "csv_path": str(self.csv_path), "platform": sys.platform, "start_memory_mb": start_mem, "end_memory_mb": end_mem, "max_memory_mb": max_mem, "average_memory_mb": avg_mem, "memory_growth_mb": growth}
# --- CrawlerStressTest Class (Refactored for Per-Batch Logging) ---
class CrawlerStressTest:
"""Orchestrates the stress test using arun_many per chunk and a dispatcher."""
def __init__(
self,
url_count: int = DEFAULT_URL_COUNT,
port: int = DEFAULT_PORT,
max_sessions: int = DEFAULT_MAX_SESSIONS,
chunk_size: int = DEFAULT_CHUNK_SIZE, # Added chunk_size
report_path: str = DEFAULT_REPORT_PATH,
stream_mode: bool = DEFAULT_STREAM_MODE,
monitor_mode: str = DEFAULT_MONITOR_MODE,
use_rate_limiter: bool = False
):
self.url_count = url_count
self.server_port = port
self.max_sessions = max_sessions
self.chunk_size = chunk_size # Store chunk size
self.report_path = pathlib.Path(report_path)
self.report_path.mkdir(parents=True, exist_ok=True)
self.stream_mode = stream_mode
self.monitor_mode = DisplayMode[monitor_mode.upper()]
self.use_rate_limiter = use_rate_limiter
self.test_id = time.strftime("%Y%m%d_%H%M%S")
self.results_summary = {
"test_id": self.test_id, "url_count": url_count, "max_sessions": max_sessions,
"chunk_size": chunk_size, "stream_mode": stream_mode, "monitor_mode": monitor_mode,
"rate_limiter_used": use_rate_limiter, "start_time": "", "end_time": "",
"total_time_seconds": 0, "successful_urls": 0, "failed_urls": 0,
"urls_processed": 0, "chunks_processed": 0
}
async def run(self) -> Dict:
"""Run the stress test and return results."""
memory_tracker = SimpleMemoryTracker(report_path=self.report_path, test_id=self.test_id)
urls = [f"http://localhost:{self.server_port}/page_{i}.html" for i in range(self.url_count)]
# Split URLs into chunks based on self.chunk_size
url_chunks = [urls[i:i+self.chunk_size] for i in range(0, len(urls), self.chunk_size)]
self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
start_time = time.time()
config = CrawlerRunConfig(
wait_for_images=False, verbose=False,
stream=self.stream_mode, # Still pass stream mode, affects arun_many return type
cache_mode=CacheMode.BYPASS
)
total_successful_urls = 0
total_failed_urls = 0
total_urls_processed = 0
start_memory_sample = memory_tracker.sample()
start_memory_str = start_memory_sample.get("memory_str", "Unknown")
# monitor = CrawlerMonitor(display_mode=self.monitor_mode, total_urls=self.url_count)
monitor = None
rate_limiter = RateLimiter(base_delay=(0.1, 0.3)) if self.use_rate_limiter else None
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=self.max_sessions, monitor=monitor, rate_limiter=rate_limiter)
console.print(f"\n[bold cyan]Crawl4AI Stress Test - {self.url_count} URLs, {self.max_sessions} max sessions[/bold cyan]")
console.print(f"[bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]Monitor:[/bold cyan] {self.monitor_mode.name}, [bold cyan]Chunk Size:[/bold cyan] {self.chunk_size}")
console.print(f"[bold cyan]Initial Memory:[/bold cyan] {start_memory_str}")
# Print batch log header only if not streaming
if not self.stream_mode:
console.print("\n[bold]Batch Progress:[/bold] (Monitor below shows overall progress)")
console.print("[bold] Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status [/bold]")
console.print("" * 90)
monitor_task = asyncio.create_task(self._periodic_memory_sample(memory_tracker, 2.0))
try:
async with AsyncWebCrawler(
config=BrowserConfig( verbose = False)
) as crawler:
# Process URLs chunk by chunk
for chunk_idx, url_chunk in enumerate(url_chunks):
batch_start_time = time.time()
chunk_success = 0
chunk_failed = 0
# Sample memory before the chunk
start_mem_sample = memory_tracker.sample()
start_mem_str = start_mem_sample.get("memory_str", "Unknown")
# --- Call arun_many for the current chunk ---
try:
# Note: dispatcher/monitor persist across calls
results_gen_or_list: Union[AsyncGenerator[CrawlResult, None], List[CrawlResult]] = \
await crawler.arun_many(
urls=url_chunk,
config=config,
dispatcher=dispatcher # Reuse the same dispatcher
)
if self.stream_mode:
# Process stream results if needed, but batch logging is less relevant
async for result in results_gen_or_list:
total_urls_processed += 1
if result.success: chunk_success += 1
else: chunk_failed += 1
# In stream mode, batch summary isn't as meaningful here
# We could potentially track completion per chunk async, but it's complex
else: # Batch mode
# Process the list of results for this chunk
for result in results_gen_or_list:
total_urls_processed += 1
if result.success: chunk_success += 1
else: chunk_failed += 1
except Exception as e:
console.print(f"[bold red]Error processing chunk {chunk_idx+1}: {e}[/bold red]")
chunk_failed = len(url_chunk) # Assume all failed in the chunk on error
total_urls_processed += len(url_chunk) # Count them as processed (failed)
# --- Log batch results (only if not streaming) ---
if not self.stream_mode:
batch_time = time.time() - batch_start_time
urls_per_sec = len(url_chunk) / batch_time if batch_time > 0 else 0
end_mem_sample = memory_tracker.sample()
end_mem_str = end_mem_sample.get("memory_str", "Unknown")
progress_pct = (total_urls_processed / self.url_count) * 100
if chunk_failed == 0: status_color, status = "green", "Success"
elif chunk_success == 0: status_color, status = "red", "Failed"
else: status_color, status = "yellow", "Partial"
console.print(
f" {chunk_idx+1:<5} | {progress_pct:6.1f}% | {start_mem_str:>9} | {end_mem_str:>9} | {urls_per_sec:8.1f} | "
f"{chunk_success:^7}/{chunk_failed:<6} | {batch_time:8.2f} | [{status_color}]{status:<7}[/{status_color}]"
)
# Accumulate totals
total_successful_urls += chunk_success
total_failed_urls += chunk_failed
self.results_summary["chunks_processed"] += 1
# Optional small delay between starting chunks if needed
# await asyncio.sleep(0.1)
except Exception as e:
console.print(f"[bold red]An error occurred during the main crawl loop: {e}[/bold red]")
finally:
if 'monitor_task' in locals() and not monitor_task.done():
monitor_task.cancel()
try: await monitor_task
except asyncio.CancelledError: pass
end_time = time.time()
self.results_summary.update({
"end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"total_time_seconds": end_time - start_time,
"successful_urls": total_successful_urls,
"failed_urls": total_failed_urls,
"urls_processed": total_urls_processed,
"memory": memory_tracker.get_report()
})
self._save_results()
return self.results_summary
async def _periodic_memory_sample(self, tracker: SimpleMemoryTracker, interval: float):
"""Background task to sample memory periodically."""
while True:
tracker.sample()
try:
await asyncio.sleep(interval)
except asyncio.CancelledError:
break # Exit loop on cancellation
def _save_results(self) -> None:
results_path = self.report_path / f"test_summary_{self.test_id}.json"
try:
with open(results_path, 'w', encoding='utf-8') as f: json.dump(self.results_summary, f, indent=2, default=str)
# console.print(f"\n[bold green]Results summary saved to {results_path}[/bold green]") # Moved summary print to run_full_test
except Exception as e: console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
# --- run_full_test Function (Adjusted) ---
async def run_full_test(args):
"""Run the complete test process from site generation to crawling."""
server = None
site_generated = False
# --- Site Generation --- (Same as before)
if not args.use_existing_site and not args.skip_generation:
if os.path.exists(args.site_path): console.print(f"[yellow]Removing existing site directory: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
site_generator = SiteGenerator(site_path=args.site_path, page_count=args.urls); site_generator.generate_site(); site_generated = True
elif args.use_existing_site: console.print(f"[cyan]Using existing site assumed to be running on port {args.port}[/cyan]")
elif args.skip_generation:
console.print(f"[cyan]Skipping site generation, using existing directory: {args.site_path}[/cyan]")
if not os.path.exists(args.site_path) or not os.path.isdir(args.site_path): console.print(f"[bold red]Error: Site path '{args.site_path}' does not exist or is not a directory.[/bold red]"); return
# --- Start Local Server --- (Same as before)
server_started = False
if not args.use_existing_site:
server = LocalHttpServer(site_path=args.site_path, port=args.port)
try: server.start(); server_started = True
except Exception as e:
console.print(f"[bold red]Failed to start local server. Aborting test.[/bold red]")
if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
return
try:
# --- Run the Stress Test ---
test = CrawlerStressTest(
url_count=args.urls,
port=args.port,
max_sessions=args.max_sessions,
chunk_size=args.chunk_size, # Pass chunk_size
report_path=args.report_path,
stream_mode=args.stream,
monitor_mode=args.monitor_mode,
use_rate_limiter=args.use_rate_limiter
)
results = await test.run() # Run the test which now handles chunks internally
# --- Print Summary ---
console.print("\n" + "=" * 80)
console.print("[bold green]Test Completed[/bold green]")
console.print("=" * 80)
# (Summary printing logic remains largely the same)
success_rate = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
urls_per_second = results["urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_sessions']} sessions, Chunk: {results['chunk_size']}, Stream: {results['stream_mode']}, Monitor: {results['monitor_mode']}")
console.print(f"[bold cyan]Results:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['urls_processed']} processed, {success_rate:.1f}% success)")
console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f} seconds total, {urls_per_second:.2f} URLs/second avg")
mem_report = results.get("memory", {})
mem_info_str = "Memory tracking data unavailable."
if mem_report and not mem_report.get("error"):
start_mb = mem_report.get('start_memory_mb'); end_mb = mem_report.get('end_memory_mb'); max_mb = mem_report.get('max_memory_mb'); growth_mb = mem_report.get('memory_growth_mb')
mem_parts = []
if start_mb is not None: mem_parts.append(f"Start: {start_mb:.1f} MB")
if end_mb is not None: mem_parts.append(f"End: {end_mb:.1f} MB")
if max_mb is not None: mem_parts.append(f"Max: {max_mb:.1f} MB")
if growth_mb is not None: mem_parts.append(f"Growth: {growth_mb:.1f} MB")
if mem_parts: mem_info_str = ", ".join(mem_parts)
csv_path = mem_report.get('csv_path')
if csv_path: console.print(f"[dim]Memory samples saved to: {csv_path}[/dim]")
console.print(f"[bold cyan]Memory Usage:[/bold cyan] {mem_info_str}")
console.print(f"[bold green]Results summary saved to {results['memory']['csv_path'].replace('memory_samples', 'test_summary').replace('.csv', '.json')}[/bold green]") # Infer summary path
if results["failed_urls"] > 0: console.print(f"\n[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate:.1f}% failure rate)[/bold yellow]")
if results["urls_processed"] < results["url_count"]: console.print(f"\n[bold red]Error: Only {results['urls_processed']} out of {results['url_count']} URLs were processed![/bold red]")
finally:
# --- Stop Server / Cleanup --- (Same as before)
if server_started and server and not args.keep_server_alive: server.stop()
elif server_started and server and args.keep_server_alive:
console.print(f"[bold cyan]Server is kept running on port {args.port}. Press Ctrl+C to stop it.[/bold cyan]")
try: await asyncio.Future() # Keep running indefinitely
except KeyboardInterrupt: console.print("\n[bold yellow]Stopping server due to user interrupt...[/bold yellow]"); server.stop()
if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
elif args.clean_site and os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
# --- main Function (Added chunk_size argument) ---
def main():
"""Main entry point for the script."""
parser = argparse.ArgumentParser(description="Crawl4AI SDK High Volume Stress Test using arun_many")
# Test parameters
parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Number of URLs to test (default: {DEFAULT_URL_COUNT})")
parser.add_argument("--max-sessions", type=int, default=DEFAULT_MAX_SESSIONS, help=f"Maximum concurrent crawling sessions (default: {DEFAULT_MAX_SESSIONS})")
parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per batch for logging (default: {DEFAULT_CHUNK_SIZE})") # Added
parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Enable streaming mode (disables batch logging) (default: {DEFAULT_STREAM_MODE})")
parser.add_argument("--monitor-mode", type=str, default=DEFAULT_MONITOR_MODE, choices=["DETAILED", "AGGREGATED"], help=f"Display mode for the live monitor (default: {DEFAULT_MONITOR_MODE})")
parser.add_argument("--use-rate-limiter", action="store_true", default=False, help="Enable a basic rate limiter (default: False)")
# Environment parameters
parser.add_argument("--site-path", type=str, default=DEFAULT_SITE_PATH, help=f"Path to generate/use the test site (default: {DEFAULT_SITE_PATH})")
parser.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port for the local HTTP server (default: {DEFAULT_PORT})")
parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
# Site/Server management
parser.add_argument("--skip-generation", action="store_true", help="Use existing test site folder without regenerating")
parser.add_argument("--use-existing-site", action="store_true", help="Do not generate site or start local server; assume site exists on --port")
parser.add_argument("--keep-server-alive", action="store_true", help="Keep the local HTTP server running after test")
parser.add_argument("--keep-site", action="store_true", help="Keep the generated test site files after test")
parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
parser.add_argument("--clean-site", action="store_true", help="Clean up site directory before running (if generating) or after")
args = parser.parse_args()
# Display config
console.print("[bold underline]Crawl4AI SDK Stress Test Configuration[/bold underline]")
console.print(f"URLs: {args.urls}, Max Sessions: {args.max_sessions}, Chunk Size: {args.chunk_size}") # Added chunk size
console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}, Monitor: {args.monitor_mode}, Rate Limit: {args.use_rate_limiter}")
console.print(f"Site Path: {args.site_path}, Port: {args.port}, Report Path: {args.report_path}")
console.print("-" * 40)
# (Rest of config display and cleanup logic is the same)
if args.use_existing_site: console.print("[cyan]Mode: Using existing external site/server[/cyan]")
elif args.skip_generation: console.print("[cyan]Mode: Using existing site files, starting local server[/cyan]")
else: console.print("[cyan]Mode: Generating site files, starting local server[/cyan]")
if args.keep_server_alive: console.print("[cyan]Option: Keep server alive after test[/cyan]")
if args.keep_site: console.print("[cyan]Option: Keep site files after test[/cyan]")
if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
if args.clean_site: console.print("[cyan]Option: Clean site directory[/cyan]")
console.print("-" * 40)
if args.clean_reports:
if os.path.exists(args.report_path): console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]"); shutil.rmtree(args.report_path)
os.makedirs(args.report_path, exist_ok=True)
if args.clean_site and not args.use_existing_site:
if os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
# Run
try: asyncio.run(run_full_test(args))
except KeyboardInterrupt: console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
except Exception as e: console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}"); import traceback; traceback.print_exc()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,32 @@
from crawl4ai import BrowserProfiler
import asyncio
if __name__ == "__main__":
# Example usage
profiler = BrowserProfiler()
# Create a new profile
from pathlib import Path
home_dir = Path.home()
profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
print(f"Profile created at: {profile_path}")
# # Launch a standalone browser
# asyncio.run(profiler.launch_standalone_browser())
# # List profiles
# profiles = profiler.list_profiles()
# for profile in profiles:
# print(f"Profile: {profile['name']}, Path: {profile['path']}")
# # Delete a profile
# success = profiler.delete_profile("my-profile")
# if success:
# print("Profile deleted successfully")
# else:
# print("Failed to delete profile")

View File

@@ -19,7 +19,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
print(f"Success: {result.success}")
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
print(f"Markdown length: {len(result.markdown.raw_markdown)}")
if __name__ == "__main__":

View File

@@ -1,4 +1,5 @@
import unittest, os
from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import (
RegexChunking,
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
word_count_threshold=5,
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
),
bypass_cache=True,
)