Merge branch 'main' into fix-raw-url-parsing
@@ -7,6 +7,7 @@ import json
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

+from crawl4ai import LLMConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.chunking_strategy import RegexChunking
 from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
     async with AsyncWebCrawler(verbose=True) as crawler:
         url = "https://www.nbcnews.com/business"
         extraction_strategy = LLMExtractionStrategy(
-            provider="openai/gpt-4o-mini",
-            api_token=os.getenv("OPENAI_API_KEY"),
+            llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
             instruction="Extract only content related to technology",
         )
         result = await crawler.arun(
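In short, the hunk above replaces the old provider= / api_token= keyword arguments with a single llm_config= parameter. A minimal before/after sketch, using only the names shown in the hunk (exact LLMConfig / LLMExtractionStrategy signatures are assumed from those lines; the instruction value is copied from the diff):

# Before (old-style keyword arguments, removed in this commit):
extraction_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),
    instruction="Extract only content related to technology",
)

# After (single llm_config parameter, as added above):
extraction_strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
    instruction="Extract only content related to technology",
)
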
4
tests/browser/docker/__init__.py
Normal file
@@ -0,0 +1,4 @@
"""Docker browser strategy tests.

This package contains tests for the Docker browser strategy implementation.
"""

651
tests/browser/docker/test_docker_browser.py
Normal file
@@ -0,0 +1,651 @@
"""Test examples for Docker Browser Strategy.
|
||||
|
||||
These examples demonstrate the functionality of Docker Browser Strategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import uuid
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.browser import DockerConfig
|
||||
from crawl4ai.browser import DockerRegistry
|
||||
from crawl4ai.browser import DockerUtils
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
# Global Docker utils instance
|
||||
docker_utils = DockerUtils(logger)
|
||||
|
||||
async def test_docker_components():
|
||||
"""Test Docker utilities, registry, and image building.
|
||||
|
||||
This function tests the core Docker components before running the browser tests.
|
||||
It validates DockerRegistry, DockerUtils, and builds test images to ensure
|
||||
everything is functioning correctly.
|
||||
"""
|
||||
logger.info("Testing Docker components", tag="SETUP")
|
||||
|
||||
# Create a test registry directory
|
||||
registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
|
||||
registry_file = os.path.join(registry_dir, "test_registry.json")
|
||||
os.makedirs(registry_dir, exist_ok=True)
|
||||
|
||||
try:
|
||||
# 1. Test DockerRegistry
|
||||
logger.info("Testing DockerRegistry...", tag="SETUP")
|
||||
registry = DockerRegistry(registry_file)
|
||||
|
||||
# Test saving and loading registry
|
||||
test_container_id = "test-container-123"
|
||||
registry.register_container(test_container_id, 9876, "test-hash-123")
|
||||
registry.save()
|
||||
|
||||
# Create a new registry instance that loads from the file
|
||||
registry2 = DockerRegistry(registry_file)
|
||||
port = registry2.get_container_host_port(test_container_id)
|
||||
hash_value = registry2.get_container_config_hash(test_container_id)
|
||||
|
||||
if port != 9876 or hash_value != "test-hash-123":
|
||||
logger.error("DockerRegistry persistence failed", tag="SETUP")
|
||||
return False
|
||||
|
||||
# Clean up test container from registry
|
||||
registry2.unregister_container(test_container_id)
|
||||
logger.success("DockerRegistry works correctly", tag="SETUP")
|
||||
|
||||
# 2. Test DockerUtils
|
||||
logger.info("Testing DockerUtils...", tag="SETUP")
|
||||
|
||||
# Test port detection
|
||||
in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use
|
||||
logger.info(f"Port 22 in use: {in_use}", tag="SETUP")
|
||||
|
||||
# Get next available port
|
||||
available_port = docker_utils.get_next_available_port(9000)
|
||||
logger.info(f"Next available port: {available_port}", tag="SETUP")
|
||||
|
||||
# Test config hash generation
|
||||
config_dict = {"mode": "connect", "headless": True}
|
||||
config_hash = docker_utils.generate_config_hash(config_dict)
|
||||
logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")
|
||||
|
||||
# 3. Test Docker is available
|
||||
logger.info("Checking Docker availability...", tag="SETUP")
|
||||
if not await check_docker_available():
|
||||
logger.error("Docker is not available - cannot continue tests", tag="SETUP")
|
||||
return False
|
||||
|
||||
# 4. Test building connect image
|
||||
logger.info("Building connect mode Docker image...", tag="SETUP")
|
||||
connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
|
||||
if not connect_image:
|
||||
logger.error("Failed to build connect mode image", tag="SETUP")
|
||||
return False
|
||||
logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")
|
||||
|
||||
# 5. Test building launch image
|
||||
logger.info("Building launch mode Docker image...", tag="SETUP")
|
||||
launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
|
||||
if not launch_image:
|
||||
logger.error("Failed to build launch mode image", tag="SETUP")
|
||||
return False
|
||||
logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")
|
||||
|
||||
# 6. Test creating and removing container
|
||||
logger.info("Testing container creation and removal...", tag="SETUP")
|
||||
container_id = await docker_utils.create_container(
|
||||
image_name=launch_image,
|
||||
host_port=available_port,
|
||||
container_name="crawl4ai-test-container"
|
||||
)
|
||||
|
||||
if not container_id:
|
||||
logger.error("Failed to create test container", tag="SETUP")
|
||||
return False
|
||||
|
||||
logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")
|
||||
|
||||
# Verify container is running
|
||||
running = await docker_utils.is_container_running(container_id)
|
||||
if not running:
|
||||
logger.error("Test container is not running", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
# Test commands in container
|
||||
logger.info("Testing command execution in container...", tag="SETUP")
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["ls", "-la", "/"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error(f"Command execution failed: {stderr}", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
# Verify Chrome is installed in the container
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["which", "chromium"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error("Chrome not found in container", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
chrome_path = stdout.strip()
|
||||
logger.info(f"Chrome found at: {chrome_path}", tag="SETUP")
|
||||
|
||||
# Test Chrome version
|
||||
returncode, stdout, stderr = await docker_utils.exec_in_container(
|
||||
container_id, ["chromium", "--version"]
|
||||
)
|
||||
|
||||
if returncode != 0:
|
||||
logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
|
||||
await docker_utils.remove_container(container_id)
|
||||
return False
|
||||
|
||||
logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")
|
||||
|
||||
# Remove test container
|
||||
removed = await docker_utils.remove_container(container_id)
|
||||
if not removed:
|
||||
logger.error("Failed to remove test container", tag="SETUP")
|
||||
return False
|
||||
|
||||
logger.success("Test container removed successfully", tag="SETUP")
|
||||
|
||||
# All components tested successfully
|
||||
logger.success("All Docker components tested successfully", tag="SETUP")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
|
||||
return False
|
||||
finally:
|
||||
# Clean up registry test directory
|
||||
if os.path.exists(registry_dir):
|
||||
shutil.rmtree(registry_dir)
|
||||
|
||||
async def test_docker_connect_mode():
    """Test Docker browser in connect mode.

    This tests the basic functionality of creating a browser in Docker
    connect mode and using it for navigation.
    """
    logger.info("Testing Docker browser in connect mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            await manager.close()
        except:
            pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

async def test_docker_launch_mode():
    """Test Docker browser in launch mode.

    This tests launching a Chrome browser within a Docker container
    on demand with custom settings.
    """
    logger.info("Testing Docker browser in launch mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
    os.makedirs(temp_dir, exist_ok=True)

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="launch",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            text_mode=True, # Enable text mode for faster operation
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            await manager.close()
        except:
            pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

async def test_docker_persistent_storage():
    """Test Docker browser with persistent storage.

    This tests creating localStorage data in one session and verifying
    it persists to another session when using persistent storage.
    """
    logger.info("Testing Docker browser with persistent storage", tag="TEST")

    # Create a unique temp directory
    test_id = uuid.uuid4().hex[:8]
    temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
    os.makedirs(temp_dir, exist_ok=True)

    manager1 = None
    manager2 = None

    try:
        # Create Docker configuration with persistence
        docker_config = DockerConfig(
            mode="connect",
            persistent=True, # Keep container running between sessions
            user_data_dir=temp_dir,
            container_user_data_dir="/data"
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create first browser manager
        manager1 = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig()

        # Get a page
        page1, context1 = await manager1.get_page(crawler_config)

        # Navigate to example.com
        await page1.goto("https://example.com")

        # Set localStorage item
        test_value = f"test_value_{test_id}"
        await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
        logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")

        # Close the first browser manager
        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # Create second browser manager with same config
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        # Get a page
        page2, context2 = await manager2.get_page(crawler_config)

        # Navigate to same site
        await page2.goto("https://example.com")

        # Get localStorage item
        value = await page2.evaluate("localStorage.getItem('test_key')")
        logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")

        # Check if persistence worked
        if value == test_value:
            logger.success("Storage persistence verified!", tag="TEST")
        else:
            logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")

        # Clean up
        await manager2.close()
        logger.info("Second browser closed successfully", tag="TEST")

        return value == test_value
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            if manager1:
                await manager1.close()
            if manager2:
                await manager2.close()
        except:
            pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)

async def test_docker_parallel_pages():
    """Test Docker browser with parallel page creation.

    This tests the ability to create and use multiple pages in parallel
    from a single Docker browser instance.
    """
    logger.info("Testing Docker browser with parallel pages", tag="TEST")

    try:
        # Create Docker configuration
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig()

        # Get multiple pages
        page_count = 3
        pages = await manager.get_pages(crawler_config, count=page_count)
        logger.info(f"Got {len(pages)} pages successfully", tag="TEST")

        if len(pages) != page_count:
            logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
            await manager.close()
            return False

        # Navigate to different sites with each page
        tasks = []
        for i, (page, _) in enumerate(pages):
            tasks.append(page.goto(f"https://example.com?page={i}"))

        # Wait for all navigations to complete
        await asyncio.gather(*tasks)
        logger.info("All pages navigated successfully", tag="TEST")

        # Get titles from all pages
        titles = []
        for i, (page, _) in enumerate(pages):
            title = await page.title()
            titles.append(title)
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            await manager.close()
        except:
            pass
        return False

async def test_docker_registry_reuse():
    """Test Docker container reuse via registry.

    This tests that containers with matching configurations
    are reused rather than creating new ones.
    """
    logger.info("Testing Docker container reuse via registry", tag="TEST")

    # Create registry for this test
    registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
    registry_file = os.path.join(registry_dir, "registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    manager1 = None
    manager2 = None
    container_id1 = None

    try:
        # Create identical Docker configurations with custom registry
        docker_config1 = DockerConfig(
            mode="connect",
            persistent=True, # Keep container running after closing
            registry_file=registry_file
        )

        # Create first browser configuration
        browser_config1 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config1
        )

        # Create first browser manager
        manager1 = BrowserManager(browser_config=browser_config1, logger=logger)

        # Start the first browser
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        # Get container ID from the strategy
        docker_strategy1 = manager1.strategy
        container_id1 = docker_strategy1.container_id
        logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")

        # Close the first manager but keep container running
        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # Create second Docker configuration identical to first
        docker_config2 = DockerConfig(
            mode="connect",
            persistent=True,
            registry_file=registry_file
        )

        # Create second browser configuration
        browser_config2 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config2
        )

        # Create second browser manager
        manager2 = BrowserManager(browser_config=browser_config2, logger=logger)

        # Start the second browser - should reuse existing container
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        # Get container ID from the second strategy
        docker_strategy2 = manager2.strategy
        container_id2 = docker_strategy2.container_id
        logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")

        # Verify container reuse
        if container_id1 == container_id2:
            logger.success("Container reuse successful - using same container!", tag="TEST")
        else:
            logger.error("Container reuse failed - new container created!", tag="TEST")

        # Clean up
        docker_strategy2.docker_config.persistent = False
        docker_strategy2.docker_config.remove_on_exit = True
        await manager2.close()
        logger.info("Second browser closed and container removed", tag="TEST")

        return container_id1 == container_id2
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup
        try:
            if manager1:
                await manager1.close()
            if manager2:
                await manager2.close()
            # Make sure container is removed
            if container_id1:
                await docker_utils.remove_container(container_id1, force=True)
        except:
            pass
        return False
    finally:
        # Clean up registry directory
        if os.path.exists(registry_dir):
            shutil.rmtree(registry_dir)

async def run_tests():
    """Run all tests sequentially."""
    results = []

    logger.info("Starting Docker Browser Strategy tests", tag="TEST")

    # Check if Docker is available
    if not await check_docker_available():
        logger.error("Docker is not available - skipping tests", tag="TEST")
        return

    # First test Docker components
    # setup_result = await test_docker_components()
    # if not setup_result:
    # logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
    # return

    # Run browser tests
    results.append(await test_docker_connect_mode())
    results.append(await test_docker_launch_mode())
    results.append(await test_docker_persistent_storage())
    results.append(await test_docker_parallel_pages())
    results.append(await test_docker_registry_reuse())

    # Print summary
    total = len(results)
    passed = sum(1 for r in results if r)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")

async def check_docker_available() -> bool:
    """Check if Docker is available on the system.

    Returns:
        bool: True if Docker is available, False otherwise
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "docker", "--version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, _ = await proc.communicate()
        return proc.returncode == 0 and stdout
    except:
        return False

if __name__ == "__main__":
    asyncio.run(run_tests())

525
tests/browser/manager/demo_browser_manager.py
Normal file
@@ -0,0 +1,525 @@
"""Demo script for testing the enhanced BrowserManager.
|
||||
|
||||
This script demonstrates the browser pooling capabilities of the enhanced
|
||||
BrowserManager with various configurations and usage patterns.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import random
|
||||
|
||||
from crawl4ai.browser.manager import BrowserManager, UnavailableBehavior
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
import playwright
|
||||
|
||||
SAFE_URLS = [
|
||||
"https://example.com",
|
||||
"https://example.com/page1",
|
||||
"https://httpbin.org/get",
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/ip",
|
||||
"https://httpbin.org/user-agent",
|
||||
"https://httpbin.org/headers",
|
||||
"https://httpbin.org/cookies",
|
||||
"https://httpstat.us/200",
|
||||
"https://httpstat.us/301",
|
||||
"https://httpstat.us/404",
|
||||
"https://httpstat.us/500",
|
||||
"https://jsonplaceholder.typicode.com/posts/1",
|
||||
"https://jsonplaceholder.typicode.com/posts/2",
|
||||
"https://jsonplaceholder.typicode.com/posts/3",
|
||||
"https://jsonplaceholder.typicode.com/posts/4",
|
||||
"https://jsonplaceholder.typicode.com/posts/5",
|
||||
"https://jsonplaceholder.typicode.com/comments/1",
|
||||
"https://jsonplaceholder.typicode.com/comments/2",
|
||||
"https://jsonplaceholder.typicode.com/users/1",
|
||||
"https://jsonplaceholder.typicode.com/users/2",
|
||||
"https://jsonplaceholder.typicode.com/albums/1",
|
||||
"https://jsonplaceholder.typicode.com/albums/2",
|
||||
"https://jsonplaceholder.typicode.com/photos/1",
|
||||
"https://jsonplaceholder.typicode.com/photos/2",
|
||||
"https://jsonplaceholder.typicode.com/todos/1",
|
||||
"https://jsonplaceholder.typicode.com/todos/2",
|
||||
"https://www.iana.org",
|
||||
"https://www.iana.org/domains",
|
||||
"https://www.iana.org/numbers",
|
||||
"https://www.iana.org/protocols",
|
||||
"https://www.iana.org/about",
|
||||
"https://www.iana.org/time-zones",
|
||||
"https://www.data.gov",
|
||||
"https://catalog.data.gov/dataset",
|
||||
"https://www.archives.gov",
|
||||
"https://www.usa.gov",
|
||||
"https://www.loc.gov",
|
||||
"https://www.irs.gov",
|
||||
"https://www.census.gov",
|
||||
"https://www.bls.gov",
|
||||
"https://www.gpo.gov",
|
||||
"https://www.w3.org",
|
||||
"https://www.w3.org/standards",
|
||||
"https://www.w3.org/WAI",
|
||||
"https://www.rfc-editor.org",
|
||||
"https://www.ietf.org",
|
||||
"https://www.icann.org",
|
||||
"https://www.internetsociety.org",
|
||||
"https://www.python.org"
|
||||
]
|
||||
|
||||
async def basic_pooling_demo():
|
||||
"""Demonstrate basic browser pooling functionality."""
|
||||
print("\n=== Basic Browser Pooling Demo ===")
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Create browser configurations
|
||||
config1 = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
browser_mode="playwright"
|
||||
)
|
||||
|
||||
config2 = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
browser_mode="cdp"
|
||||
)
|
||||
|
||||
# Create browser manager with on-demand behavior
|
||||
manager = BrowserManager(
|
||||
browser_config=config1,
|
||||
logger=logger,
|
||||
unavailable_behavior=UnavailableBehavior.ON_DEMAND,
|
||||
max_browsers_per_config=3
|
||||
)
|
||||
|
||||
try:
|
||||
# Initialize pool with both configurations
|
||||
print("Initializing browser pool...")
|
||||
await manager.initialize_pool(
|
||||
browser_configs=[config1, config2],
|
||||
browsers_per_config=2
|
||||
)
|
||||
|
||||
# Display initial pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"Initial pool status: {status}")
|
||||
|
||||
# Create crawler run configurations
|
||||
run_config1 = CrawlerRunConfig()
|
||||
run_config2 = CrawlerRunConfig()
|
||||
|
||||
# Simulate concurrent page requests
|
||||
print("\nGetting pages for parallel crawling...")
|
||||
|
||||
# Function to simulate crawling
|
||||
async def simulate_crawl(index: int, config: BrowserConfig, run_config: CrawlerRunConfig):
|
||||
print(f"Crawler {index}: Requesting page...")
|
||||
page, context, strategy = await manager.get_page(run_config, config)
|
||||
print(f"Crawler {index}: Got page, navigating to example.com...")
|
||||
|
||||
try:
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
print(f"Crawler {index}: Page title: {title}")
|
||||
|
||||
# Simulate work
|
||||
await asyncio.sleep(random.uniform(1, 3))
|
||||
print(f"Crawler {index}: Work completed, releasing page...")
|
||||
|
||||
# Check dynamic page content
|
||||
content = await page.content()
|
||||
content_length = len(content)
|
||||
print(f"Crawler {index}: Page content length: {content_length}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Crawler {index}: Error: {str(e)}")
|
||||
finally:
|
||||
# Release the page
|
||||
await manager.release_page(page, strategy, config)
|
||||
print(f"Crawler {index}: Page released")
|
||||
|
||||
# Create 5 parallel crawls
|
||||
crawl_tasks = []
|
||||
for i in range(5):
|
||||
# Alternate between configurations
|
||||
config = config1 if i % 2 == 0 else config2
|
||||
run_config = run_config1 if i % 2 == 0 else run_config2
|
||||
|
||||
task = asyncio.create_task(simulate_crawl(i+1, config, run_config))
|
||||
crawl_tasks.append(task)
|
||||
|
||||
# Wait for all crawls to complete
|
||||
await asyncio.gather(*crawl_tasks)
|
||||
|
||||
# Display final pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"\nFinal pool status: {status}")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
print("\nClosing browser manager...")
|
||||
await manager.close()
|
||||
print("Browser manager closed")
|
||||
|
||||
|
||||
async def prewarm_pages_demo():
|
||||
"""Demonstrate page pre-warming functionality."""
|
||||
print("\n=== Page Pre-warming Demo ===")
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Create browser configuration
|
||||
config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
browser_mode="playwright"
|
||||
)
|
||||
|
||||
# Create crawler run configurations for pre-warming
|
||||
run_config1 = CrawlerRunConfig(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
)
|
||||
|
||||
run_config2 = CrawlerRunConfig(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
|
||||
)
|
||||
|
||||
# Create page pre-warm configurations
|
||||
page_configs = [
|
||||
(config, run_config1, 2), # 2 pages with run_config1
|
||||
(config, run_config2, 3) # 3 pages with run_config2
|
||||
]
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(
|
||||
browser_config=config,
|
||||
logger=logger,
|
||||
unavailable_behavior=UnavailableBehavior.EXCEPTION
|
||||
)
|
||||
|
||||
try:
|
||||
# Initialize pool with pre-warmed pages
|
||||
print("Initializing browser pool with pre-warmed pages...")
|
||||
await manager.initialize_pool(
|
||||
browser_configs=[config],
|
||||
browsers_per_config=2,
|
||||
page_configs=page_configs
|
||||
)
|
||||
|
||||
# Display pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"Pool status after pre-warming: {status}")
|
||||
|
||||
# Simulate using pre-warmed pages
|
||||
print("\nUsing pre-warmed pages...")
|
||||
|
||||
async def use_prewarm_page(index: int, run_config: CrawlerRunConfig):
|
||||
print(f"Task {index}: Requesting pre-warmed page...")
|
||||
page, context, strategy = await manager.get_page(run_config, config)
|
||||
|
||||
try:
|
||||
print(f"Task {index}: Got page, navigating to example.com...")
|
||||
await page.goto("https://example.com")
|
||||
|
||||
# Verify user agent was applied correctly
|
||||
user_agent = await page.evaluate("() => navigator.userAgent")
|
||||
print(f"Task {index}: User agent: {user_agent}")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
print(f"Task {index}: Page title: {title}")
|
||||
|
||||
# Simulate work
|
||||
await asyncio.sleep(1)
|
||||
finally:
|
||||
# Release the page
|
||||
print(f"Task {index}: Releasing page...")
|
||||
await manager.release_page(page, strategy, config)
|
||||
|
||||
# Create tasks to use pre-warmed pages
|
||||
tasks = []
|
||||
# Use run_config1 pages
|
||||
for i in range(2):
|
||||
tasks.append(asyncio.create_task(use_prewarm_page(i+1, run_config1)))
|
||||
|
||||
# Use run_config2 pages
|
||||
for i in range(3):
|
||||
tasks.append(asyncio.create_task(use_prewarm_page(i+3, run_config2)))
|
||||
|
||||
# Wait for all tasks to complete
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
# Try to use more pages than we pre-warmed (should raise exception)
|
||||
print("\nTrying to use more pages than pre-warmed...")
|
||||
try:
|
||||
page, context, strategy = await manager.get_page(run_config1, config)
|
||||
try:
|
||||
print("Got extra page (unexpected)")
|
||||
await page.goto("https://example.com")
|
||||
finally:
|
||||
await manager.release_page(page, strategy, config)
|
||||
except Exception as e:
|
||||
print(f"Expected exception when requesting more pages: {str(e)}")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
print("\nClosing browser manager...")
|
||||
await manager.close()
|
||||
print("Browser manager closed")
|
||||
|
||||
|
||||
async def prewarm_on_demand_demo():
|
||||
"""Demonstrate pre-warming with on-demand browser creation."""
|
||||
print("\n=== Pre-warming with On-Demand Browser Creation Demo ===")
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Create browser configuration
|
||||
config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
browser_mode="playwright"
|
||||
)
|
||||
|
||||
# Create crawler run configurations
|
||||
run_config = CrawlerRunConfig(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
)
|
||||
|
||||
# Create page pre-warm configurations - just pre-warm 2 pages
|
||||
page_configs = [
|
||||
(config, run_config, 2)
|
||||
]
|
||||
|
||||
# Create browser manager with ON_DEMAND behavior
|
||||
manager = BrowserManager(
|
||||
browser_config=config,
|
||||
logger=logger,
|
||||
unavailable_behavior=UnavailableBehavior.ON_DEMAND,
|
||||
max_browsers_per_config=5 # Allow up to 5 browsers
|
||||
)
|
||||
|
||||
try:
|
||||
# Initialize pool with pre-warmed pages
|
||||
print("Initializing browser pool with pre-warmed pages...")
|
||||
await manager.initialize_pool(
|
||||
browser_configs=[config],
|
||||
browsers_per_config=1, # Start with just 1 browser
|
||||
page_configs=page_configs
|
||||
)
|
||||
|
||||
# Display initial pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"Initial pool status: {status}")
|
||||
|
||||
# Simulate using more pages than pre-warmed - should create browsers on demand
|
||||
print("\nUsing more pages than pre-warmed (should create on demand)...")
|
||||
|
||||
async def use_page(index: int):
|
||||
print(f"Task {index}: Requesting page...")
|
||||
page, context, strategy = await manager.get_page(run_config, config)
|
||||
|
||||
try:
|
||||
print(f"Task {index}: Got page, navigating to example.com...")
|
||||
await page.goto("https://example.com")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
print(f"Task {index}: Page title: {title}")
|
||||
|
||||
# Simulate work for a varying amount of time
|
||||
work_time = 1 + (index * 0.5) # Stagger completion times
|
||||
print(f"Task {index}: Working for {work_time} seconds...")
|
||||
await asyncio.sleep(work_time)
|
||||
print(f"Task {index}: Work completed")
|
||||
finally:
|
||||
# Release the page
|
||||
print(f"Task {index}: Releasing page...")
|
||||
await manager.release_page(page, strategy, config)
|
||||
|
||||
# Create more tasks than pre-warmed pages
|
||||
tasks = []
|
||||
for i in range(5): # Try to use 5 pages when only 2 are pre-warmed
|
||||
tasks.append(asyncio.create_task(use_page(i+1)))
|
||||
|
||||
# Wait for all tasks to complete
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
# Display final pool status - should show on-demand created browsers
|
||||
status = await manager.get_pool_status()
|
||||
print(f"\nFinal pool status: {status}")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
print("\nClosing browser manager...")
|
||||
await manager.close()
|
||||
print("Browser manager closed")
|
||||
|
||||
|
||||
async def high_volume_demo():
|
||||
"""Demonstrate high-volume access to pre-warmed pages."""
|
||||
print("\n=== High Volume Pre-warmed Pages Demo ===")
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Create browser configuration
|
||||
config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
browser_mode="playwright"
|
||||
)
|
||||
|
||||
# Create crawler run configuration
|
||||
run_config = CrawlerRunConfig(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
)
|
||||
|
||||
# Set up dimensions
|
||||
browser_count = 10
|
||||
pages_per_browser = 5
|
||||
total_pages = browser_count * pages_per_browser
|
||||
|
||||
# Create page pre-warm configuration
|
||||
page_configs = [
|
||||
(config, run_config, total_pages)
|
||||
]
|
||||
|
||||
print(f"Preparing {browser_count} browsers with {pages_per_browser} pages each ({total_pages} total pages)")
|
||||
|
||||
# Create browser manager with ON_DEMAND behavior as fallback
|
||||
# No need to specify max_browsers_per_config as it will be calculated automatically
|
||||
manager = BrowserManager(
|
||||
browser_config=config,
|
||||
logger=logger,
|
||||
unavailable_behavior=UnavailableBehavior.ON_DEMAND
|
||||
)
|
||||
|
||||
try:
|
||||
# Initialize pool with browsers and pre-warmed pages
|
||||
print(f"Pre-warming {total_pages} pages...")
|
||||
start_time = time.time()
|
||||
await manager.initialize_pool(
|
||||
browser_configs=[config],
|
||||
browsers_per_config=browser_count,
|
||||
page_configs=page_configs
|
||||
)
|
||||
warmup_time = time.time() - start_time
|
||||
print(f"Pre-warming completed in {warmup_time:.2f} seconds")
|
||||
|
||||
# Display pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"Pool status after pre-warming: {status}")
|
||||
|
||||
# Simulate using all pre-warmed pages simultaneously
|
||||
print(f"\nSending {total_pages} crawl requests simultaneously...")
|
||||
|
||||
async def crawl_page(index: int):
|
||||
# url = f"https://example.com/page{index}"
|
||||
url = SAFE_URLS[index % len(SAFE_URLS)]
|
||||
print(f"Page {index}: Requesting page...")
|
||||
# Measure time to acquire page
|
||||
page_start = time.time()
|
||||
page, context, strategy = await manager.get_page(run_config, config)
|
||||
page_acquisition_time = time.time() - page_start
|
||||
|
||||
try:
|
||||
# Navigate to the URL
|
||||
nav_start = time.time()
|
||||
await page.goto(url, timeout=5000)
|
||||
navigation_time = time.time() - nav_start
|
||||
|
||||
# Get the page title
|
||||
title = await page.title()
|
||||
|
||||
return {
|
||||
"index": index,
|
||||
"url": url,
|
||||
"title": title,
|
||||
"page_acquisition_time": page_acquisition_time,
|
||||
"navigation_time": navigation_time
|
||||
}
|
||||
except playwright._impl._errors.TimeoutError as e:
|
||||
# print(f"Page {index}: Navigation timed out - {e}")
|
||||
return {
|
||||
"index": index,
|
||||
"url": url,
|
||||
"title": "Navigation timed out",
|
||||
"page_acquisition_time": page_acquisition_time,
|
||||
"navigation_time": 0
|
||||
}
|
||||
finally:
|
||||
# Release the page
|
||||
await manager.release_page(page, strategy, config)
|
||||
|
||||
# Create and execute all tasks simultaneously
|
||||
start_time = time.time()
|
||||
|
||||
# Non-parallel way
|
||||
# for i in range(total_pages):
|
||||
# await crawl_page(i+1)
|
||||
|
||||
tasks = [crawl_page(i+1) for i in range(total_pages)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# # Print all titles
|
||||
# for result in results:
|
||||
# print(f"Page {result['index']} ({result['url']}): Title: {result['title']}")
|
||||
# print(f" Page acquisition time: {result['page_acquisition_time']:.4f}s")
|
||||
# print(f" Navigation time: {result['navigation_time']:.4f}s")
|
||||
# print(f" Total time: {result['page_acquisition_time'] + result['navigation_time']:.4f}s")
|
||||
# print("-" * 40)
|
||||
|
||||
# Report results
|
||||
print(f"\nAll {total_pages} crawls completed in {total_time:.2f} seconds")
|
||||
|
||||
# Calculate statistics
|
||||
acquisition_times = [r["page_acquisition_time"] for r in results]
|
||||
navigation_times = [r["navigation_time"] for r in results]
|
||||
|
||||
avg_acquisition = sum(acquisition_times) / len(acquisition_times)
|
||||
max_acquisition = max(acquisition_times)
|
||||
min_acquisition = min(acquisition_times)
|
||||
|
||||
avg_navigation = sum(navigation_times) / len(navigation_times)
|
||||
max_navigation = max(navigation_times)
|
||||
min_navigation = min(navigation_times)
|
||||
|
||||
print("\nPage acquisition times:")
|
||||
print(f" Average: {avg_acquisition:.4f}s")
|
||||
print(f" Min: {min_acquisition:.4f}s")
|
||||
print(f" Max: {max_acquisition:.4f}s")
|
||||
|
||||
print("\nPage navigation times:")
|
||||
print(f" Average: {avg_navigation:.4f}s")
|
||||
print(f" Min: {min_navigation:.4f}s")
|
||||
print(f" Max: {max_navigation:.4f}s")
|
||||
|
||||
# Display final pool status
|
||||
status = await manager.get_pool_status()
|
||||
print(f"\nFinal pool status: {status}")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
print("\nClosing browser manager...")
|
||||
await manager.close()
|
||||
print("Browser manager closed")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos."""
|
||||
# await basic_pooling_demo()
|
||||
# await prewarm_pages_demo()
|
||||
# await prewarm_on_demand_demo()
|
||||
await high_volume_demo()
|
||||
# Additional demo functions can be added here
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
190
tests/browser/test_browser_manager.py
Normal file
@@ -0,0 +1,190 @@
"""Test examples for BrowserManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_basic_browser_manager():
|
||||
"""Test basic BrowserManager functionality with default configuration."""
|
||||
logger.info("Starting test_basic_browser_manager", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create a browser manager with default config
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Page created successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_basic_browser_manager completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_custom_browser_config():
|
||||
"""Test BrowserManager with custom browser configuration."""
|
||||
logger.info("Starting test_custom_browser_config", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create a custom browser config
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=800,
|
||||
light_mode=True
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully with custom config", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Verify viewport size
|
||||
viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
|
||||
logger.info(f"Viewport size: {viewport_size}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_custom_browser_config completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_multiple_pages():
|
||||
"""Test BrowserManager with multiple pages."""
|
||||
logger.info("Starting test_multiple_pages", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create browser manager
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create multiple pages
|
||||
pages = []
|
||||
urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
|
||||
|
||||
for i, url in enumerate(urls):
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
await page.goto(url)
|
||||
pages.append((page, url))
|
||||
logger.info(f"Created page {i+1} for {url}", tag="TEST")
|
||||
|
||||
# Verify all pages are loaded correctly
|
||||
for i, (page, url) in enumerate(pages):
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_multiple_pages completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def test_session_management():
|
||||
"""Test session management in BrowserManager."""
|
||||
logger.info("Starting test_session_management", tag="TEST")
|
||||
|
||||
try:
|
||||
# Create browser manager
|
||||
manager = BrowserManager(logger=logger)
|
||||
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create a session
|
||||
session_id = "test_session_1"
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
|
||||
page1, context1 = await manager.get_page(crawler_config)
|
||||
await page1.goto("https://example.com")
|
||||
logger.info(f"Created session with ID: {session_id}", tag="TEST")
|
||||
|
||||
# Get the same session again
|
||||
page2, context2 = await manager.get_page(crawler_config)
|
||||
|
||||
# Verify it's the same page/context
|
||||
is_same_page = page1 == page2
|
||||
is_same_context = context1 == context2
|
||||
logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
|
||||
|
||||
# Kill the session
|
||||
await manager.kill_session(session_id)
|
||||
logger.info(f"Killed session with ID: {session_id}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.success("test_session_management completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_basic_browser_manager())
|
||||
results.append(await test_custom_browser_config())
|
||||
results.append(await test_multiple_pages())
|
||||
results.append(await test_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
809
tests/browser/test_builtin_browser.py
Normal file
@@ -0,0 +1,809 @@
"""
|
||||
Test script for builtin browser functionality in the browser module.
|
||||
|
||||
This script tests:
|
||||
1. Creating a builtin browser
|
||||
2. Getting browser information
|
||||
3. Killing the browser
|
||||
4. Restarting the browser
|
||||
5. Testing operations with different browser strategies
|
||||
6. Testing edge cases
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
# Add the project root to the path for imports
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.text import Text
|
||||
from rich.box import Box, SIMPLE
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Initialize colorama for cross-platform colored terminal output
|
||||
init()
|
||||
|
||||
# Define colors for pretty output
|
||||
SUCCESS = Fore.GREEN
|
||||
WARNING = Fore.YELLOW
|
||||
ERROR = Fore.RED
|
||||
INFO = Fore.CYAN
|
||||
RESET = Fore.RESET
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
|
||||
async def test_builtin_browser_creation():
|
||||
"""Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy"""
|
||||
print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")
|
||||
|
||||
# Step 1: Create a BrowserManager with builtin mode
|
||||
print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
|
||||
browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Step 2: Check if we have a BuiltinBrowserStrategy
|
||||
print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
|
||||
if isinstance(manager.strategy, BuiltinBrowserStrategy):
|
||||
print(
|
||||
f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Step 3: Start the manager to launch or connect to builtin browser
|
||||
print(f"\n{INFO}3. Starting the browser manager{RESET}")
|
||||
try:
|
||||
await manager.start()
|
||||
print(f"{SUCCESS}Browser manager started successfully{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
|
||||
return None
|
||||
|
||||
# Step 4: Get browser info from the strategy
|
||||
print(f"\n{INFO}4. Getting browser information{RESET}")
|
||||
browser_info = manager.strategy.get_browser_info()
|
||||
if browser_info:
|
||||
print(f"{SUCCESS}Browser info retrieved:{RESET}")
|
||||
for key, value in browser_info.items():
|
||||
if key != "config": # Skip the verbose config section
|
||||
print(f" {key}: {value}")
|
||||
|
||||
cdp_url = browser_info.get("cdp_url")
|
||||
print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
|
||||
else:
|
||||
print(f"{ERROR}Failed to get browser information{RESET}")
|
||||
cdp_url = None
|
||||
|
||||
# Save manager for later tests
|
||||
return manager, cdp_url
|
||||
|
||||
|
||||
async def test_page_operations(manager: BrowserManager):
|
||||
"""Test page operations with the builtin browser"""
|
||||
print(
|
||||
f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
|
||||
)
|
||||
|
||||
# Step 1: Get a single page
|
||||
print(f"\n{INFO}1. Getting a single page{RESET}")
|
||||
try:
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
print(f"{SUCCESS}Got page successfully{RESET}")
|
||||
|
||||
# Navigate to a test URL
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
print(f"{SUCCESS}Page title: {title}{RESET}")
|
||||
|
||||
# Close the page
|
||||
await page.close()
|
||||
print(f"{SUCCESS}Page closed successfully{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 2: Get multiple pages
|
||||
print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
|
||||
try:
|
||||
# Request 3 pages
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=3)
|
||||
print(f"{SUCCESS}Got {len(pages)} pages{RESET}")
|
||||
|
||||
# Test each page
|
||||
for i, (page, context) in enumerate(pages):
|
||||
await page.goto(f"https://example.com?test={i}")
|
||||
title = await page.title()
|
||||
print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
|
||||
await page.close()
|
||||
|
||||
print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def test_browser_status_management(manager: BrowserManager):
|
||||
"""Test browser status and management operations"""
|
||||
print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")
|
||||
|
||||
# Step 1: Get browser status
|
||||
print(f"\n{INFO}1. Getting browser status{RESET}")
|
||||
try:
|
||||
status = await manager.strategy.get_builtin_browser_status()
|
||||
print(f"{SUCCESS}Browser status:{RESET}")
|
||||
print(f" Running: {status['running']}")
|
||||
print(f" CDP URL: {status['cdp_url']}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 2: Test killing the browser
|
||||
print(f"\n{INFO}2. Testing killing the browser{RESET}")
|
||||
try:
|
||||
result = await manager.strategy.kill_builtin_browser()
|
||||
if result:
|
||||
print(f"{SUCCESS}Browser killed successfully{RESET}")
|
||||
else:
|
||||
print(f"{ERROR}Failed to kill browser{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 3: Check status after kill
|
||||
print(f"\n{INFO}3. Checking status after kill{RESET}")
|
||||
try:
|
||||
status = await manager.strategy.get_builtin_browser_status()
|
||||
if not status["running"]:
|
||||
print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
|
||||
else:
|
||||
print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 4: Launch a new browser
|
||||
print(f"\n{INFO}4. Launching a new browser{RESET}")
|
||||
try:
|
||||
cdp_url = await manager.strategy.launch_builtin_browser(
|
||||
browser_type="chromium", headless=True
|
||||
)
|
||||
if cdp_url:
|
||||
print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
|
||||
else:
|
||||
print(f"{ERROR}Failed to launch new browser{RESET}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def test_multiple_managers():
|
||||
"""Test creating multiple BrowserManagers that use the same builtin browser"""
|
||||
print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")
|
||||
|
||||
# Step 1: Create first manager
|
||||
print(f"\n{INFO}1. Creating first browser manager{RESET}")
|
||||
browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
|
||||
manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
|
||||
|
||||
# Step 2: Create second manager
|
||||
print(f"\n{INFO}2. Creating second browser manager{RESET}")
|
||||
browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
|
||||
manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
|
||||
|
||||
# Step 3: Start both managers (should connect to the same builtin browser)
|
||||
print(f"\n{INFO}3. Starting both managers{RESET}")
|
||||
try:
|
||||
await manager1.start()
|
||||
print(f"{SUCCESS}First manager started{RESET}")
|
||||
|
||||
await manager2.start()
|
||||
print(f"{SUCCESS}Second manager started{RESET}")
|
||||
|
||||
# Check if they got the same CDP URL
|
||||
cdp_url1 = manager1.strategy.config.cdp_url
|
||||
cdp_url2 = manager2.strategy.config.cdp_url
|
||||
|
||||
if cdp_url1 == cdp_url2:
|
||||
print(
|
||||
f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 4: Test using both managers
|
||||
print(f"\n{INFO}4. Testing operations with both managers{RESET}")
|
||||
try:
|
||||
# First manager creates a page
|
||||
page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
|
||||
await page1.goto("https://example.com")
|
||||
title1 = await page1.title()
|
||||
print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")
|
||||
|
||||
# Second manager creates a page
|
||||
page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
|
||||
await page2.goto("https://example.org")
|
||||
title2 = await page2.title()
|
||||
print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")
|
||||
|
||||
# Clean up
|
||||
await page1.close()
|
||||
await page2.close()
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
# Step 5: Close both managers
|
||||
print(f"\n{INFO}5. Closing both managers{RESET}")
|
||||
try:
|
||||
await manager1.close()
|
||||
print(f"{SUCCESS}First manager closed{RESET}")
|
||||
|
||||
await manager2.close()
|
||||
print(f"{SUCCESS}Second manager closed{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def test_edge_cases():
|
||||
"""Test edge cases like multiple starts, killing browser during operations, etc."""
|
||||
print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
|
||||
|
||||
# Step 1: Test multiple starts with the same manager
|
||||
print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
|
||||
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
print(f"{SUCCESS}First start successful{RESET}")
|
||||
|
||||
# Try to start again
|
||||
await manager.start()
|
||||
print(f"{SUCCESS}Second start completed without errors{RESET}")
|
||||
|
||||
# Test if it's still functional
|
||||
page, context = await manager.get_page(CrawlerRunConfig())
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
print(
|
||||
f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
|
||||
)
|
||||
await page.close()
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
|
||||
return False
|
||||
finally:
|
||||
await manager.close()
|
||||
|
||||
# Step 2: Test killing the browser while manager is active
|
||||
print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
print(f"{SUCCESS}Manager started{RESET}")
|
||||
|
||||
# Kill the browser directly
|
||||
print(f"{INFO}Killing the browser...{RESET}")
|
||||
await manager.strategy.kill_builtin_browser()
|
||||
print(f"{SUCCESS}Browser killed{RESET}")
|
||||
|
||||
# Try to get a page (should fail or launch a new browser)
|
||||
try:
|
||||
page, context = await manager.get_page(CrawlerRunConfig())
|
||||
print(
|
||||
f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
|
||||
)
|
||||
title = await page.title()
|
||||
print(f"{SUCCESS}Got page title: {title}{RESET}")
|
||||
await page.close()
|
||||
except Exception as e:
|
||||
print(
|
||||
f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
|
||||
return False
|
||||
finally:
|
||||
await manager.close()
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def cleanup_browsers():
|
||||
"""Clean up any remaining builtin browsers"""
|
||||
print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")
|
||||
|
||||
browser_config = BrowserConfig(browser_mode="builtin", headless=True)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# No need to start, just access the strategy directly
|
||||
strategy = manager.strategy
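# BuiltinBrowserStrategy is assumed to be imported near the top of this module
# (e.g. from crawl4ai.browser.strategies import BuiltinBrowserStrategy, as the builtin strategy tests do)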
|
||||
if isinstance(strategy, BuiltinBrowserStrategy):
|
||||
result = await strategy.kill_builtin_browser()
|
||||
if result:
|
||||
print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
|
||||
else:
|
||||
print(f"{WARNING}No builtin browsers found to kill{RESET}")
|
||||
else:
|
||||
print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
|
||||
finally:
|
||||
# Just to be safe
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
async def test_performance_scaling():
|
||||
"""Test performance with multiple browsers and pages.
|
||||
|
||||
This test creates multiple browsers on different ports,
|
||||
spawns multiple pages per browser, and measures performance metrics.
|
||||
"""
|
||||
print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
|
||||
|
||||
# Configuration parameters
|
||||
num_browsers = 10
|
||||
pages_per_browser = 10
|
||||
total_pages = num_browsers * pages_per_browser
|
||||
base_port = 9222
|
||||
|
||||
# Set up memory measurement
|
||||
import psutil
|
||||
import gc
|
||||
|
||||
# Force garbage collection before starting
|
||||
gc.collect()
|
||||
process = psutil.Process()
|
||||
initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
|
||||
peak_memory = initial_memory
|
||||
|
||||
# Report initial configuration
|
||||
print(
|
||||
f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
|
||||
)
|
||||
|
||||
# List to track managers
|
||||
managers: List[BrowserManager] = []
|
||||
all_pages = []
|
||||
|
||||
|
||||
|
||||
# Get crawl4ai home directory
|
||||
crawl4ai_home = os.path.expanduser("~/.crawl4ai")
|
||||
temp_dir = os.path.join(crawl4ai_home, "temp")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
# Create all managers but don't start them yet
|
||||
manager_configs = []
|
||||
for i in range(num_browsers):
|
||||
port = base_port + i
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True,
|
||||
debugging_port=port,
|
||||
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
manager.strategy.shutting_down = True
|
||||
manager_configs.append((manager, i, port))
|
||||
|
||||
# Define async function to start a single manager
|
||||
async def start_manager(manager, index, port):
|
||||
try:
|
||||
await manager.start()
|
||||
return manager
|
||||
except Exception as e:
|
||||
print(
|
||||
f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Start all managers in parallel
|
||||
start_tasks = [
|
||||
start_manager(manager, i, port) for manager, i, port in manager_configs
|
||||
]
|
||||
started_managers = await asyncio.gather(*start_tasks)
|
||||
|
||||
# Filter out None values (failed starts) and add to managers list
|
||||
managers = [m for m in started_managers if m is not None]
|
||||
|
||||
if len(managers) == 0:
|
||||
print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
|
||||
return False
|
||||
|
||||
if len(managers) < num_browsers:
|
||||
print(
|
||||
f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
|
||||
)
|
||||
|
||||
# Create pages for each browser
|
||||
for i, manager in enumerate(managers):
|
||||
try:
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
|
||||
all_pages.extend(pages)
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
|
||||
|
||||
# Check memory after page creation
|
||||
gc.collect()
|
||||
current_memory = process.memory_info().rss / 1024 / 1024
|
||||
peak_memory = max(peak_memory, current_memory)
|
||||
|
||||
# Ask for confirmation before loading
|
||||
confirmation = input(
|
||||
f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
|
||||
)
|
||||
# Record the overall start time before the page-loading phase
|
||||
start_time = time.time()
|
||||
|
||||
if confirmation.lower() == "y":
|
||||
load_start_time = time.time()
|
||||
|
||||
# Function to load a single page
|
||||
async def load_page(page_ctx, index):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(f"https://example.com/page{index}", timeout=30000)
|
||||
title = await page.title()
|
||||
return title
|
||||
except Exception as e:
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
# Load all pages concurrently
|
||||
load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
|
||||
load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
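# With return_exceptions=True, tasks that raise come back as exception objects instead of
# propagating; they fail the isinstance(str) check below and are therefore counted as failures.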
|
||||
|
||||
# Count successes and failures
|
||||
successes = sum(
|
||||
1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
|
||||
)
|
||||
failures = len(load_results) - successes
|
||||
|
||||
load_time = time.time() - load_start_time
|
||||
total_test_time = time.time() - start_time
|
||||
|
||||
# Check memory after loading (peak memory)
|
||||
gc.collect()
|
||||
current_memory = process.memory_info().rss / 1024 / 1024
|
||||
peak_memory = max(peak_memory, current_memory)
|
||||
|
||||
# Calculate key metrics
|
||||
memory_per_page = peak_memory / successes if successes > 0 else 0
|
||||
time_per_crawl = total_test_time / successes if successes > 0 else 0
|
||||
crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
|
||||
crawls_per_minute = crawls_per_second * 60
|
||||
crawls_per_hour = crawls_per_minute * 60
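# Illustrative arithmetic (assumed numbers, not measured): 80 successful crawls in 40 s
# gives 0.5 s per crawl, 2.0 crawls/s, 120 crawls/min and 7200 crawls/h.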
|
||||
|
||||
# Print simplified performance summary
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
|
||||
# Create a simple summary table
|
||||
table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
|
||||
|
||||
table.add_column("Metric", style="cyan")
|
||||
table.add_column("Value", style="green")
|
||||
|
||||
table.add_row("Total Crawls Completed", f"{successes}")
|
||||
table.add_row("Total Time", f"{total_test_time:.2f} seconds")
|
||||
table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
|
||||
table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
|
||||
table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
|
||||
table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
|
||||
table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
|
||||
table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
|
||||
|
||||
# Display the table
|
||||
console.print(table)
|
||||
|
||||
# Ask confirmation before cleanup
|
||||
confirmation = input(
|
||||
f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
|
||||
)
|
||||
if confirmation.lower() != "y":
|
||||
print(f"{WARNING}Cleanup aborted by user{RESET}")
|
||||
return False
|
||||
|
||||
# Close all pages
|
||||
for page, _ in all_pages:
|
||||
try:
|
||||
await page.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Remove the temp directory
|
||||
import shutil
|
||||
|
||||
if os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
async def test_performance_scaling_lab(num_browsers: int = 10, pages_per_browser: int = 10):
|
||||
"""Test performance with multiple browsers and pages.
|
||||
|
||||
This test creates multiple browsers on different ports,
|
||||
spawns multiple pages per browser, and measures performance metrics.
|
||||
"""
|
||||
print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
|
||||
|
||||
# Configuration parameters
|
||||
total_pages = num_browsers * pages_per_browser
|
||||
base_port = 9222
|
||||
|
||||
# Set up memory measurement
|
||||
import psutil
|
||||
import gc
|
||||
|
||||
# Force garbage collection before starting
|
||||
gc.collect()
|
||||
process = psutil.Process()
|
||||
initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
|
||||
peak_memory = initial_memory
|
||||
|
||||
# Report initial configuration
|
||||
print(
|
||||
f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
|
||||
)
|
||||
|
||||
# List to track managers
|
||||
managers: List[BrowserManager] = []
|
||||
all_pages = []
|
||||
|
||||
# Get crawl4ai home directory
|
||||
crawl4ai_home = os.path.expanduser("~/.crawl4ai")
|
||||
temp_dir = os.path.join(crawl4ai_home, "temp")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
# Create all managers but don't start them yet
|
||||
manager_configs = []
|
||||
for i in range(num_browsers):
|
||||
port = base_port + i
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True,
|
||||
debugging_port=port,
|
||||
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
manager.strategy.shutting_down = True
|
||||
manager_configs.append((manager, i, port))
|
||||
|
||||
# Define async function to start a single manager
|
||||
async def start_manager(manager, index, port):
|
||||
try:
|
||||
await manager.start()
|
||||
return manager
|
||||
except Exception as e:
|
||||
print(
|
||||
f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Start all managers in parallel
|
||||
start_tasks = [
|
||||
start_manager(manager, i, port) for manager, i, port in manager_configs
|
||||
]
|
||||
started_managers = await asyncio.gather(*start_tasks)
|
||||
|
||||
# Filter out None values (failed starts) and add to managers list
|
||||
managers = [m for m in started_managers if m is not None]
|
||||
|
||||
if len(managers) == 0:
|
||||
print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
|
||||
return False
|
||||
|
||||
if len(managers) < num_browsers:
|
||||
print(
|
||||
f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
|
||||
)
|
||||
|
||||
# Create pages for each browser
|
||||
for i, manager in enumerate(managers):
|
||||
try:
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
|
||||
all_pages.extend(pages)
|
||||
except Exception as e:
|
||||
print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
|
||||
|
||||
# Check memory after page creation
|
||||
gc.collect()
|
||||
current_memory = process.memory_info().rss / 1024 / 1024
|
||||
peak_memory = max(peak_memory, current_memory)
|
||||
|
||||
# Ask for confirmation before loading
|
||||
confirmation = input(
|
||||
f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
|
||||
)
|
||||
# Record the overall start time before the page-loading phase
|
||||
start_time = time.time()
|
||||
|
||||
if confirmation.lower() == "y":
|
||||
load_start_time = time.time()
|
||||
|
||||
# Function to load a single page
|
||||
async def load_page(page_ctx, index):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(f"https://example.com/page{index}", timeout=30000)
|
||||
title = await page.title()
|
||||
return title
|
||||
except Exception as e:
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
# Load all pages concurrently
|
||||
load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
|
||||
load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
|
||||
|
||||
# Count successes and failures
|
||||
successes = sum(
|
||||
1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
|
||||
)
|
||||
failures = len(load_results) - successes
|
||||
|
||||
load_time = time.time() - load_start_time
|
||||
total_test_time = time.time() - start_time
|
||||
|
||||
# Check memory after loading (peak memory)
|
||||
gc.collect()
|
||||
current_memory = process.memory_info().rss / 1024 / 1024
|
||||
peak_memory = max(peak_memory, current_memory)
|
||||
|
||||
# Calculate key metrics
|
||||
memory_per_page = peak_memory / successes if successes > 0 else 0
|
||||
time_per_crawl = total_test_time / successes if successes > 0 else 0
|
||||
crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
|
||||
crawls_per_minute = crawls_per_second * 60
|
||||
crawls_per_hour = crawls_per_minute * 60
|
||||
|
||||
# Print simplified performance summary
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
|
||||
# Create a simple summary table
|
||||
table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
|
||||
|
||||
table.add_column("Metric", style="cyan")
|
||||
table.add_column("Value", style="green")
|
||||
|
||||
table.add_row("Total Crawls Completed", f"{successes}")
|
||||
table.add_row("Total Time", f"{total_test_time:.2f} seconds")
|
||||
table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
|
||||
table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
|
||||
table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
|
||||
table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
|
||||
table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
|
||||
table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
|
||||
|
||||
# Display the table
|
||||
console.print(table)
|
||||
|
||||
# Ask confirmation before cleanup
|
||||
confirmation = input(
|
||||
f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
|
||||
)
|
||||
if confirmation.lower() != "y":
|
||||
print(f"{WARNING}Cleanup aborted by user{RESET}")
|
||||
return False
|
||||
|
||||
# Close all pages
|
||||
for page, _ in all_pages:
|
||||
try:
|
||||
await page.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Remove the temp directory
|
||||
import shutil
|
||||
|
||||
if os.path.exists(temp_dir):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
return True
|
||||
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all tests"""
|
||||
try:
|
||||
print(f"{INFO}Starting builtin browser tests with browser module{RESET}")
|
||||
|
||||
# # Run browser creation test
|
||||
# manager, cdp_url = await test_builtin_browser_creation()
|
||||
# if not manager:
|
||||
# print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
|
||||
# return
|
||||
|
||||
# # Run page operations test
|
||||
# await test_page_operations(manager)
|
||||
|
||||
# # Run browser status and management test
|
||||
# await test_browser_status_management(manager)
|
||||
|
||||
# # Close manager before multiple manager test
|
||||
# await manager.close()
|
||||
|
||||
# Run multiple managers test
|
||||
await test_multiple_managers()
|
||||
|
||||
# Run performance scaling test
|
||||
await test_performance_scaling()
|
||||
|
||||
# Run cleanup test
|
||||
await cleanup_browsers()
|
||||
|
||||
# Run edge cases test
|
||||
await test_edge_cases()
|
||||
|
||||
print(f"\n{SUCCESS}All tests completed!{RESET}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
# Clean up: kill any remaining builtin browsers
|
||||
await cleanup_browsers()
|
||||
print(f"{SUCCESS}Test cleanup complete{RESET}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
160
tests/browser/test_builtin_strategy.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Test examples for BuiltinBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of BuiltinBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_builtin_browser():
|
||||
"""Test using a builtin browser that persists between sessions."""
|
||||
logger.info("Testing builtin browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start should connect to existing builtin browser or create one
|
||||
await manager.start()
|
||||
logger.info("Connected to builtin browser", tag="TEST")
|
||||
|
||||
# Test page creation
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Test navigation
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Close manager (should not close the builtin browser)
|
||||
await manager.close()
|
||||
logger.info("First session closed", tag="TEST")
|
||||
|
||||
# Create a second manager to verify browser persistence
|
||||
logger.info("Creating second session to verify persistence", tag="TEST")
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
await manager2.start()
|
||||
logger.info("Connected to existing builtin browser", tag="TEST")
|
||||
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.org")
|
||||
title2 = await page2.title()
|
||||
logger.info(f"Second session page title: {title2}", tag="TEST")
|
||||
|
||||
await manager2.close()
|
||||
logger.info("Second session closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_builtin_browser_status():
|
||||
"""Test getting status of the builtin browser."""
|
||||
logger.info("Testing builtin browser status", tag="TEST")
|
||||
|
||||
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True
|
||||
)
|
||||
|
||||
# Create strategy directly to access its status methods
|
||||
strategy = BuiltinBrowserStrategy(browser_config, logger)
|
||||
|
||||
try:
|
||||
# Get status before starting (should be not running)
|
||||
status_before = await strategy.get_builtin_browser_status()
|
||||
logger.info(f"Initial status: {status_before}", tag="TEST")
|
||||
|
||||
# Start the browser
|
||||
await strategy.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Get status after starting
|
||||
status_after = await strategy.get_builtin_browser_status()
|
||||
logger.info(f"Status after start: {status_after}", tag="TEST")
|
||||
|
||||
# Create a page to verify functionality
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await strategy.get_page(crawler_config)
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Close strategy (should not kill the builtin browser)
|
||||
await strategy.close()
|
||||
logger.info("Strategy closed successfully", tag="TEST")
|
||||
|
||||
# Create a new strategy object
|
||||
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||
|
||||
# Get status again (should still be running)
|
||||
status_final = await strategy2.get_builtin_browser_status()
|
||||
logger.info(f"Final status: {status_final}", tag="TEST")
|
||||
|
||||
# Verify that the status shows the browser is running
|
||||
is_running = status_final.get('running', False)
|
||||
logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
|
||||
|
||||
# Kill the builtin browser to clean up
|
||||
logger.info("Killing builtin browser", tag="TEST")
|
||||
success = await strategy2.kill_builtin_browser()
|
||||
logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
|
||||
|
||||
return is_running and success
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await strategy.close()
|
||||
|
||||
# Try to kill the builtin browser to clean up
|
||||
strategy2 = BuiltinBrowserStrategy(browser_config, logger)
|
||||
await strategy2.kill_builtin_browser()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_builtin_browser())
|
||||
results.append(await test_builtin_browser_status())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
228
tests/browser/test_cdp_strategy.py
Normal file
@@ -0,0 +1,228 @@
|
||||
"""Test examples for CDPBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of CDPBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_cdp_launch_connect():
|
||||
"""Test launching a browser and connecting via CDP."""
|
||||
logger.info("Testing launch and connect via CDP", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
browser_mode="cdp",
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched and connected via CDP", tag="TEST")
|
||||
|
||||
# Test with multiple pages
|
||||
pages = []
|
||||
for i in range(3):
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
await page.goto(f"https://example.com?test={i}")
|
||||
pages.append(page)
|
||||
logger.info(f"Created page {i+1}", tag="TEST")
|
||||
|
||||
# Verify all pages are working
|
||||
for i, page in enumerate(pages):
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_cdp_with_user_data_dir():
|
||||
"""Test CDP browser with a user data directory."""
|
||||
logger.info("Testing CDP browser with user data directory", tag="TEST")
|
||||
|
||||
# Create a temporary user data directory
|
||||
import tempfile
|
||||
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
|
||||
logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
browser_mode="cdp",
|
||||
user_data_dir=user_data_dir
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched with user data directory", tag="TEST")
|
||||
|
||||
# Navigate to a page and store some data
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Set a cookie
|
||||
await context.add_cookies([{
|
||||
"name": "test_cookie",
|
||||
"value": "test_value",
|
||||
"url": "https://example.com"
|
||||
}])
|
||||
|
||||
# Visit the site
|
||||
await page.goto("https://example.com")
|
||||
|
||||
# Verify cookie was set
|
||||
cookies = await context.cookies(["https://example.com"])
|
||||
has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
|
||||
logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
|
||||
|
||||
# Close the browser
|
||||
await manager.close()
|
||||
logger.info("First browser session closed", tag="TEST")
|
||||
|
||||
# Start a new browser with the same user data directory
|
||||
logger.info("Starting second browser session with same user data directory", tag="TEST")
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
await manager2.start()
|
||||
|
||||
# Get a new page and check if the cookie persists
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.com")
|
||||
|
||||
# Verify cookie persisted
|
||||
cookies2 = await context2.cookies(["https://example.com"])
|
||||
has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
|
||||
logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager2.close()
|
||||
|
||||
# Remove temporary directory
|
||||
import shutil
|
||||
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||
logger.info(f"Removed temporary user data directory", tag="TEST")
|
||||
|
||||
return has_test_cookie and has_test_cookie2
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clean up temporary directory
|
||||
try:
|
||||
import shutil
|
||||
shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
async def test_cdp_session_management():
|
||||
"""Test session management with CDP browser."""
|
||||
logger.info("Testing session management with CDP browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched successfully", tag="TEST")
|
||||
|
||||
# Create two sessions
|
||||
session1_id = "test_session_1"
|
||||
session2_id = "test_session_2"
|
||||
|
||||
# Set up first session
|
||||
crawler_config1 = CrawlerRunConfig(session_id=session1_id)
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
await page1.goto("https://example.com")
|
||||
await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
|
||||
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
|
||||
|
||||
# Set up second session
|
||||
crawler_config2 = CrawlerRunConfig(session_id=session2_id)
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
await page2.goto("https://example.org")
|
||||
await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
|
||||
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
|
||||
|
||||
# Get first session again
|
||||
page1_again, _ = await manager.get_page(crawler_config1)
|
||||
|
||||
# Verify it's the same page and data persists
|
||||
is_same_page = page1 == page1_again
|
||||
data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
|
||||
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
|
||||
|
||||
# Kill first session
|
||||
await manager.kill_session(session1_id)
|
||||
logger.info(f"Killed session 1", tag="TEST")
|
||||
|
||||
# Verify second session still works
|
||||
data2 = await page2.evaluate("localStorage.getItem('session2_data')")
|
||||
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return is_same_page and data1 == "test_value" and data2 == "test_value2"
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
# results.append(await test_cdp_launch_connect())
|
||||
results.append(await test_cdp_with_user_data_dir())
|
||||
results.append(await test_cdp_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
77
tests/browser/test_combined.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Combined test runner for all browser module tests.
|
||||
|
||||
This script runs all the browser module tests in sequence and
|
||||
provides a comprehensive summary.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def run_test_module(module_name, header):
|
||||
"""Run all tests in a module and return results."""
|
||||
logger.info(f"\n{'-'*30}", tag="TEST")
|
||||
logger.info(f"RUNNING: {header}", tag="TEST")
|
||||
logger.info(f"{'-'*30}", tag="TEST")
|
||||
|
||||
# Import the module dynamically
|
||||
module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
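# A non-empty fromlist makes __import__ return the tests.browser.<module_name> submodule itself
# rather than the top-level "tests" package, so module.run_tests is accessible directly.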
|
||||
|
||||
# Track time for performance measurement
|
||||
start_time = time.time()
|
||||
|
||||
# Run the tests
|
||||
await module.run_tests()
|
||||
|
||||
# Calculate time taken
|
||||
time_taken = time.time() - start_time
|
||||
logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
|
||||
|
||||
return time_taken
|
||||
|
||||
async def main():
|
||||
"""Run all test modules."""
|
||||
logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
|
||||
|
||||
# List of test modules to run
|
||||
test_modules = [
|
||||
("test_browser_manager", "Browser Manager Tests"),
|
||||
("test_playwright_strategy", "Playwright Strategy Tests"),
|
||||
("test_cdp_strategy", "CDP Strategy Tests"),
|
||||
("test_builtin_strategy", "Builtin Browser Strategy Tests"),
|
||||
("test_profiles", "Profile Management Tests")
|
||||
]
|
||||
|
||||
# Run each test module
|
||||
timings = {}
|
||||
for module_name, header in test_modules:
|
||||
try:
|
||||
time_taken = await run_test_module(module_name, header)
|
||||
timings[module_name] = time_taken
|
||||
except Exception as e:
|
||||
logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
|
||||
|
||||
# Print summary
|
||||
logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
|
||||
logger.info(f"{'-'*50}", tag="SUMMARY")
|
||||
for module_name, header in test_modules:
|
||||
if module_name in timings:
|
||||
logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
|
||||
logger.info(f"{'-'*50}", tag="SUMMARY")
|
||||
total_time = sum(timings.values())
|
||||
logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
17
tests/browser/test_launch_standalone.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
import asyncio
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Test launching a standalone browser
|
||||
async def test_standalone_browser():
|
||||
profiler = BrowserProfiler()
|
||||
cdp_url = await profiler.launch_standalone_browser(
|
||||
browser_type="chromium",
|
||||
user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
|
||||
debugging_port=9222,
|
||||
headless=False
|
||||
)
|
||||
print(f"CDP URL: {cdp_url}")
|
||||
|
||||
asyncio.run(test_standalone_browser())
|
||||
902
tests/browser/test_parallel_crawling.py
Normal file
@@ -0,0 +1,902 @@
|
||||
"""
|
||||
Test examples for parallel crawling with the browser module.
|
||||
|
||||
These examples demonstrate the functionality of parallel page creation
|
||||
and serve as functional tests for multi-page crawling performance.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_get_pages_basic():
|
||||
"""Test basic functionality of get_pages method."""
|
||||
logger.info("Testing basic get_pages functionality", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
|
||||
# Request 3 pages
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=3)
|
||||
|
||||
# Verify we got the correct number of pages
|
||||
assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
|
||||
|
||||
# Verify each page is valid
|
||||
for i, (page, context) in enumerate(pages):
|
||||
await page.goto("https://example.com")
|
||||
title = await page.title()
|
||||
logger.info(f"Page {i+1} title: {title}", tag="TEST")
|
||||
assert title, f"Page {i+1} has no title"
|
||||
|
||||
await manager.close()
|
||||
logger.success("Basic get_pages test completed successfully", tag="TEST")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_parallel_approaches_comparison():
|
||||
"""Compare two parallel crawling approaches:
|
||||
1. Create a page for each URL on-demand (get_page + gather)
|
||||
2. Get all pages upfront with get_pages, then use them (get_pages + gather)
|
||||
"""
|
||||
logger.info("Comparing different parallel crawling approaches", tag="TEST")
|
||||
|
||||
urls = [
|
||||
"https://example.com/page1",
|
||||
"https://crawl4ai.com",
|
||||
"https://kidocode.com",
|
||||
"https://bbc.com",
|
||||
# "https://example.com/page1",
|
||||
# "https://example.com/page2",
|
||||
# "https://example.com/page3",
|
||||
# "https://example.com/page4",
|
||||
]
|
||||
|
||||
browser_config = BrowserConfig(headless=False)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
|
||||
# Approach 1: Create a page for each URL on-demand and run in parallel
|
||||
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
async def fetch_title_approach1(url):
|
||||
"""Create a new page for each URL, go to the URL, and get title"""
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Run fetch_title_approach1 for each URL in parallel
|
||||
tasks = [fetch_title_approach1(url) for url in urls]
|
||||
approach1_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach1_time = time.time() - start_time
|
||||
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
|
||||
|
||||
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
|
||||
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
# Get all pages upfront
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(urls))
|
||||
|
||||
async def fetch_title_approach2(page_ctx, url):
|
||||
"""Use a pre-created page to go to URL and get title"""
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
|
||||
approach2_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach2_time = time.time() - start_time
|
||||
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
|
||||
|
||||
# Compare results and performance
|
||||
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
|
||||
if speedup > 1:
|
||||
logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
|
||||
else:
|
||||
logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
|
||||
|
||||
# Verify same content was retrieved in both approaches
|
||||
assert len(approach1_results) == len(approach2_results), "Result count mismatch"
|
||||
|
||||
# Sort results for comparison since parallel execution might complete in different order
|
||||
assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"
|
||||
|
||||
await manager.close()
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
|
||||
"""Test performance with multiple browsers and pages per browser.
|
||||
Compares two approaches:
|
||||
1. On-demand page creation (get_page + gather)
|
||||
2. Pre-created pages (get_pages + gather)
|
||||
"""
|
||||
logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")
|
||||
|
||||
# Generate test URLs
|
||||
total_pages = num_browsers * pages_per_browser
|
||||
urls = [f"https://example.com/page_{i}" for i in range(total_pages)]
|
||||
|
||||
# Create browser managers
|
||||
managers = []
|
||||
base_port = 9222
|
||||
|
||||
try:
|
||||
# Start all browsers in parallel
|
||||
start_tasks = []
|
||||
for i in range(num_browsers):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True # Using default browser mode like in test_parallel_approaches_comparison
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among managers
|
||||
urls_per_manager = {}
|
||||
for i, manager in enumerate(managers):
|
||||
start_idx = i * pages_per_browser
|
||||
end_idx = min(start_idx + pages_per_browser, len(urls))
|
||||
urls_per_manager[manager] = urls[start_idx:end_idx]
|
||||
|
||||
# Approach 1: Create a page for each URL on-demand and run in parallel
|
||||
logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
async def fetch_title_approach1(manager, url):
|
||||
"""Create a new page for the URL, go to the URL, and get title"""
|
||||
crawler_config = CrawlerRunConfig(url=url)
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Run fetch_title_approach1 for each URL in parallel
|
||||
tasks = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
for url in manager_urls:
|
||||
tasks.append(fetch_title_approach1(manager, url))
|
||||
|
||||
approach1_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach1_time = time.time() - start_time
|
||||
logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
|
||||
|
||||
# Approach 2: Get all pages upfront with get_pages, then use them in parallel
|
||||
logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
|
||||
start_time = time.time()
|
||||
|
||||
# Get all pages upfront for each manager
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
async def fetch_title_approach2(page_ctx, url):
|
||||
"""Use a pre-created page to go to URL and get title"""
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
|
||||
approach2_results = await asyncio.gather(*tasks)
|
||||
|
||||
approach2_time = time.time() - start_time
|
||||
logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
|
||||
|
||||
# Compare results and performance
|
||||
speedup = approach1_time / approach2_time if approach2_time > 0 else 0
|
||||
pages_per_second = total_pages / approach2_time
|
||||
|
||||
# Show a simple summary
|
||||
logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
|
||||
logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
|
||||
logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")
|
||||
|
||||
if speedup > 1:
|
||||
logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
|
||||
else:
|
||||
logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def grid_search_optimal_configuration(total_urls=50):
|
||||
"""Perform a grid search to find the optimal balance between number of browsers and pages per browser.
|
||||
|
||||
This function tests different combinations of browser count and pages per browser,
|
||||
while keeping the total number of URLs constant. It measures performance metrics
|
||||
for each configuration to find the "sweet spot" that provides the best speed
|
||||
with reasonable memory usage.
|
||||
|
||||
Args:
|
||||
total_urls: Total number of URLs to crawl (default: 50)
|
||||
"""
|
||||
logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
|
||||
|
||||
# Generate test URLs once
|
||||
urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
|
||||
|
||||
# Define grid search configurations
|
||||
# We'll use a more flexible approach: test all browser counts from 1 to min(20, total_urls)
|
||||
# and distribute pages evenly (some browsers may have 1 more page than others)
|
||||
configurations = []
|
||||
|
||||
# Maximum number of browsers to test
|
||||
max_browsers_to_test = min(20, total_urls)
|
||||
|
||||
# Try configurations with 1 to max_browsers_to_test browsers
|
||||
for num_browsers in range(1, max_browsers_to_test + 1):
|
||||
base_pages_per_browser = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Generate exact page distribution array
|
||||
if remainder > 0:
|
||||
# First 'remainder' browsers get one more page
|
||||
page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
|
||||
pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
|
||||
else:
|
||||
# All browsers get the same number of pages
|
||||
page_distribution = [base_pages_per_browser] * num_browsers
|
||||
pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
|
||||
|
||||
# Format the distribution as a tuple string like (4, 4, 3, 3)
|
||||
distribution_str = str(tuple(page_distribution))
|
||||
|
||||
configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
|
||||
|
||||
# Track results
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
|
||||
logger.info("-" * 80, tag="TEST")
|
||||
logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
|
||||
logger.info(f"Details: {pages_distribution}", tag="TEST")
|
||||
# Sleep a bit for randomness
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
try:
|
||||
# Import psutil for memory tracking
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
initial_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
|
||||
initial_memory = 0
|
||||
|
||||
# Create and start browser managers
|
||||
managers = []
|
||||
start_time = time.time()
|
||||
|
||||
# Start all browsers in parallel
|
||||
start_tasks = []
|
||||
for i in range(num_browsers):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
browser_startup_time = time.time() - start_time
|
||||
|
||||
# Measure memory after browser startup
|
||||
if initial_memory > 0:
|
||||
browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
|
||||
else:
|
||||
browser_memory = 0
|
||||
|
||||
# Distribute URLs among managers using the exact page distribution
|
||||
urls_per_manager = {}
|
||||
total_assigned = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
if i < len(page_distribution):
|
||||
# Get the exact number of pages for this browser from our distribution
|
||||
manager_pages = page_distribution[i]
|
||||
|
||||
# Get the URL slice for this manager
|
||||
start_idx = total_assigned
|
||||
end_idx = start_idx + manager_pages
|
||||
urls_per_manager[manager] = urls[start_idx:end_idx]
|
||||
total_assigned += manager_pages
|
||||
else:
|
||||
# If we have more managers than our distribution (should never happen)
|
||||
urls_per_manager[manager] = []
|
||||
|
||||
# Use the more efficient approach (pre-created pages)
|
||||
logger.info("Running page crawling test...", tag="TEST")
|
||||
crawl_start_time = time.time()
|
||||
|
||||
# Get all pages upfront for each manager
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls: # Skip managers with no URLs
|
||||
continue
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Measure memory after page creation
|
||||
if initial_memory > 0:
|
||||
pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
|
||||
else:
|
||||
pages_memory = 0
|
||||
|
||||
# Function to crawl a URL with a pre-created page
|
||||
async def fetch_title(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
|
||||
crawl_results = await asyncio.gather(*tasks)
|
||||
|
||||
crawl_time = time.time() - crawl_start_time
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Final memory measurement
|
||||
if initial_memory > 0:
|
||||
peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
|
||||
else:
|
||||
peak_memory = 0
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Store result metrics
|
||||
result = {
|
||||
"num_browsers": num_browsers,
|
||||
"pages_per_browser": pages_per_browser,
|
||||
"page_distribution": page_distribution,
|
||||
"distribution_str": distribution_str,
|
||||
"total_urls": total_urls,
|
||||
"browser_startup_time": browser_startup_time,
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"browser_memory": browser_memory,
|
||||
"pages_memory": pages_memory,
|
||||
"peak_memory": peak_memory,
|
||||
"pages_per_second": pages_per_second,
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory usage
|
||||
"efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
|
||||
}
|
||||
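# Worked example with hypothetical numbers: at 12.5 pages/s and a 480 MB peak,
# the score is 12.5 / (480 + 1) ≈ 0.026; the same speed at a 240 MB peak scores
# ≈ 0.052, so leaner configurations rank higher even when they are no faster.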
|
||||
results.append(result)
|
||||
|
||||
# Log the results
|
||||
logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
|
||||
|
||||
if peak_memory > 0:
|
||||
logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Print summary of all configurations
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
|
||||
# Rank configurations by efficiency score
|
||||
ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
|
||||
|
||||
# Also determine rankings by different metrics
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
|
||||
most_efficient = ranked_results[0]
|
||||
|
||||
# Print top performers by category
|
||||
logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
|
||||
logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
|
||||
f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
|
||||
|
||||
if lowest_memory["peak_memory"] > 0:
|
||||
logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
|
||||
f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
|
||||
|
||||
logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
|
||||
f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
|
||||
|
||||
# Print result table header
|
||||
logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Define table header
|
||||
header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
|
||||
logger.info(header, tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Print each configuration in ranked order
|
||||
for rank, result in enumerate(ranked_results, 1):
|
||||
# Add special notes for top performers
|
||||
notes = []
|
||||
if result == fastest:
|
||||
notes.append("⚡ Fastest")
|
||||
if result == lowest_memory:
|
||||
notes.append("💾 Lowest Memory")
|
||||
if result == most_efficient:
|
||||
notes.append("🌟 Most Efficient")
|
||||
|
||||
notes_str = " | ".join(notes) if notes else ""
|
||||
|
||||
# Format memory if available
|
||||
memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
|
||||
|
||||
# Get the distribution string
|
||||
dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
|
||||
|
||||
# Build the row
|
||||
row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
|
||||
row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
|
||||
|
||||
logger.info(row, tag="TEST")
|
||||
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Generate visualization if matplotlib is available
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Extract data for plotting from ranked results
|
||||
browser_counts = [r["num_browsers"] for r in ranked_results]
|
||||
efficiency_scores = [r["efficiency_score"] for r in ranked_results]
|
||||
crawl_times = [r["crawl_time"] for r in ranked_results]
|
||||
total_times = [r["total_time"] for r in ranked_results]
|
||||
|
||||
# Filter results with memory data
|
||||
memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
|
||||
memory_browser_counts = [r["num_browsers"] for r in memory_results]
|
||||
peak_memories = [r["peak_memory"] for r in memory_results]
|
||||
|
||||
# Create figure with clean design
|
||||
plt.figure(figsize=(14, 12), facecolor='white')
|
||||
plt.style.use('ggplot')
|
||||
|
||||
# Create grid for subplots
|
||||
gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
|
||||
|
||||
# Plot 1: Efficiency Score (higher is better)
|
||||
ax1 = plt.subplot(gs[0])
|
||||
bar_colors = ['#3498db'] * len(browser_counts)
|
||||
|
||||
# Highlight the most efficient
|
||||
most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
|
||||
bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient
|
||||
|
||||
bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
|
||||
ax1.set_xticks(range(len(browser_counts)))
|
||||
ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
|
||||
ax1.set_xlabel('Number of Browsers')
|
||||
ax1.set_ylabel('Efficiency Score (higher is better)')
|
||||
ax1.set_title('Browser Configuration Efficiency (higher is better)')
|
||||
|
||||
# Add value labels on top of bars
|
||||
for bar, score in zip(bars, efficiency_scores):
|
||||
height = bar.get_height()
|
||||
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
|
||||
f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
|
||||
|
||||
# Highlight best configuration
|
||||
ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
|
||||
transform=ax1.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
|
||||
|
||||
# Plot 2: Time Performance
|
||||
ax2 = plt.subplot(gs[1])
|
||||
|
||||
# Plot both total time and crawl time
|
||||
ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
|
||||
ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
|
||||
|
||||
# Mark the fastest configuration
|
||||
fastest_idx = browser_counts.index(fastest["num_browsers"])
|
||||
ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
|
||||
label=f'Fastest: {fastest["num_browsers"]} browsers')
|
||||
|
||||
ax2.set_xlabel('Number of Browsers')
|
||||
ax2.set_ylabel('Time (seconds)')
|
||||
ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
|
||||
ax2.grid(True, linestyle='--', alpha=0.7)
|
||||
ax2.legend(loc='upper right')
|
||||
|
||||
# Plot pages per second on second y-axis
|
||||
pages_per_second = [total_urls/t for t in crawl_times]
|
||||
ax2_twin = ax2.twinx()
|
||||
ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
|
||||
ax2_twin.set_ylabel('Pages per second')
|
||||
|
||||
# Add note about the fastest configuration
|
||||
ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
|
||||
f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
|
||||
transform=ax2.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
|
||||
|
||||
# Plot 3: Memory Usage (if available)
|
||||
if memory_results:
|
||||
ax3 = plt.subplot(gs[2])
|
||||
|
||||
# Prepare data for grouped bar chart
|
||||
memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
|
||||
memory_per_page = [m/(n*p) for m, n, p in zip(
|
||||
[r["peak_memory"] for r in memory_results],
|
||||
[r["num_browsers"] for r in memory_results],
|
||||
[r["pages_per_browser"] for r in memory_results])]
|
||||
|
||||
x = np.arange(len(memory_browser_counts))
|
||||
width = 0.35
|
||||
|
||||
# Create grouped bars
|
||||
ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
|
||||
ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
|
||||
|
||||
# Configure axis
|
||||
ax3.set_xticks(x)
|
||||
ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
|
||||
ax3.set_xlabel('Number of Browsers')
|
||||
ax3.set_ylabel('Memory (MB)')
|
||||
ax3.set_title('Memory Usage by Browser Configuration')
|
||||
ax3.legend(loc='upper left')
|
||||
ax3.grid(True, linestyle='--', alpha=0.7)
|
||||
|
||||
# Add second y-axis for memory per page
|
||||
ax3_twin = ax3.twinx()
|
||||
ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
|
||||
ax3_twin.set_ylabel('Memory per Page (MB)')
|
||||
|
||||
# Get lowest memory configuration
|
||||
lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
|
||||
|
||||
# Add note about lowest memory configuration
|
||||
ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
|
||||
f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
|
||||
transform=ax3.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
|
||||
|
||||
# Add overall title
|
||||
plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
|
||||
|
||||
# Add timestamp and info at the bottom
|
||||
plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
|
||||
ha="center", fontsize=10, style='italic')
|
||||
|
||||
# Get current directory and save the figure there
|
||||
import os
|
||||
__current_file = os.path.abspath(__file__)
|
||||
current_dir = os.path.dirname(__current_file)
|
||||
output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
|
||||
|
||||
# Adjust layout and save figure with high DPI
|
||||
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
|
||||
plt.savefig(output_file, dpi=200, bbox_inches='tight')
|
||||
logger.success(f"Visualization saved to {output_file}", tag="TEST")
|
||||
|
||||
except ImportError:
|
||||
logger.warning("matplotlib not available, skipping visualization", tag="TEST")
|
||||
|
||||
return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
|
||||
|
||||
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
|
||||
"""Find optimal browser configuration for crawling a specific number of URLs.
|
||||
|
||||
Args:
|
||||
total_urls: Number of URLs to crawl
|
||||
verbose: Whether to print progress
|
||||
rate_limit_delay: Delay between page loads to avoid rate limiting
|
||||
|
||||
Returns:
|
||||
dict: Contains fastest, lowest_memory, and optimal configurations
|
||||
"""
|
||||
if verbose:
|
||||
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
|
||||
|
||||
# Generate test URLs with timestamp to avoid caching
|
||||
timestamp = int(time.time())
|
||||
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
|
||||
|
||||
# Limit browser configurations to test (1 browser to max 10)
|
||||
max_browsers = min(10, total_urls)
|
||||
configs_to_test = []
|
||||
|
||||
# Generate configurations (browser count, pages distribution)
|
||||
for num_browsers in range(1, max_browsers + 1):
|
||||
base_pages = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
|
||||
if remainder > 0:
|
||||
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
|
||||
else:
|
||||
distribution = [base_pages] * num_browsers
|
||||
|
||||
configs_to_test.append((num_browsers, distribution))
|
||||
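# Example: total_urls=10 with num_browsers=4 gives base_pages=2 and remainder=2,
# producing the distribution [3, 3, 2, 2] described above.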
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for browser_count, page_distribution in configs_to_test:
|
||||
if verbose:
|
||||
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
|
||||
|
||||
try:
|
||||
# Track memory if possible
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
if verbose:
|
||||
print("Memory tracking not available (psutil not installed)")
|
||||
start_memory = 0
|
||||
|
||||
# Start browsers in parallel
|
||||
managers = []
|
||||
start_tasks = []
|
||||
start_time = time.time()
|
||||
|
||||
for i in range(browser_count):
|
||||
config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among browsers
|
||||
urls_per_manager = {}
|
||||
url_index = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
pages_for_this_browser = page_distribution[i]
|
||||
end_index = url_index + pages_for_this_browser
|
||||
urls_per_manager[manager] = urls[url_index:end_index]
|
||||
url_index = end_index
|
||||
|
||||
# Create pages for each browser
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls:
|
||||
continue
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Crawl pages with delay to avoid rate limiting
|
||||
async def crawl_page(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
if rate_limit_delay > 0:
|
||||
await asyncio.sleep(rate_limit_delay)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
crawl_start = time.time()
|
||||
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
|
||||
await asyncio.gather(*crawl_tasks)
|
||||
crawl_time = time.time() - crawl_start
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Measure final memory usage
|
||||
if start_memory > 0:
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
memory_used = end_memory - start_memory
|
||||
else:
|
||||
memory_used = 0
|
||||
|
||||
# Close all browsers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory
|
||||
if memory_used > 0:
|
||||
efficiency = pages_per_second / (memory_used + 1)
|
||||
else:
|
||||
efficiency = pages_per_second
|
||||
|
||||
# Store result
|
||||
result = {
|
||||
"browser_count": browser_count,
|
||||
"distribution": tuple(page_distribution),
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"memory_used": memory_used,
|
||||
"pages_per_second": pages_per_second,
|
||||
"efficiency": efficiency
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
|
||||
if memory_used > 0:
|
||||
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
|
||||
print(f" ✓ Efficiency score: {efficiency:.4f}")
|
||||
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f" ✗ Error: {str(e)}")
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# If no successful results, return None
|
||||
if not results:
|
||||
return None
|
||||
|
||||
# Find best configurations
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
|
||||
# Only consider memory if available
|
||||
memory_results = [r for r in results if r["memory_used"] > 0]
|
||||
if memory_results:
|
||||
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
|
||||
else:
|
||||
lowest_memory = fastest
|
||||
|
||||
# Find most efficient (balanced speed vs memory)
|
||||
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
|
||||
|
||||
# Print summary
|
||||
if verbose:
|
||||
print("\n=== OPTIMAL CONFIGURATIONS ===")
|
||||
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
|
||||
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
|
||||
|
||||
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
|
||||
if lowest_memory["memory_used"] > 0:
|
||||
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
|
||||
|
||||
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
|
||||
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
|
||||
|
||||
return {
|
||||
"fastest": fastest,
|
||||
"lowest_memory": lowest_memory,
|
||||
"optimal": optimal,
|
||||
"all_configs": results
|
||||
}
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
# Find optimal configuration using our utility function
|
||||
configs = await find_optimal_browser_config(
|
||||
total_urls=20, # Use a small number for faster testing
|
||||
verbose=True,
|
||||
rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting
|
||||
)
|
||||
|
||||
if configs:
|
||||
# Show the optimal configuration
|
||||
optimal = configs["optimal"]
|
||||
print(f"\n🎯 Recommended configuration for production use:")
|
||||
print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
|
||||
print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
|
||||
results.append(True)
|
||||
else:
|
||||
print("\n❌ Failed to find optimal configuration")
|
||||
results.append(False)
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
print(f"\nTests complete: {passed}/{total} passed")
|
||||
|
||||
if passed == total:
|
||||
print("All tests passed!")
|
||||
else:
|
||||
print(f"{total - passed} tests failed")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
316
tests/browser/test_playwright_strategy.py
Normal file
316
tests/browser/test_playwright_strategy.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Test examples for PlaywrightBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of PlaywrightBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
|
||||
|
||||
async def test_start_close():
|
||||
# Create browser config for standard Playwright
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
for _ in range(4):
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
page, context = await manager.get_page(CrawlerRunConfig())
|
||||
logger.info("Got page successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
await asyncio.sleep(1) # Wait for a moment before restarting
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
return True
|
||||
|
||||
async def test_playwright_basic():
|
||||
"""Test basic Playwright browser functionality."""
|
||||
logger.info("Testing standard Playwright browser", tag="TEST")
|
||||
|
||||
# Create browser config for standard Playwright
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create crawler config
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
|
||||
# Get a page
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
logger.info("Got page successfully", tag="TEST")
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_text_mode():
|
||||
"""Test Playwright browser in text-only mode."""
|
||||
logger.info("Testing Playwright text mode", tag="TEST")
|
||||
|
||||
# Create browser config with text mode enabled
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
text_mode=True # Enable text-only mode
|
||||
)
|
||||
|
||||
# Create browser manager with the config
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully in text mode", tag="TEST")
|
||||
|
||||
# Get a page
|
||||
crawler_config = CrawlerRunConfig(url="https://example.com")
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate to a website
|
||||
await page.goto("https://example.com")
|
||||
logger.info("Navigated to example.com", tag="TEST")
|
||||
|
||||
# Get page title
|
||||
title = await page.title()
|
||||
logger.info(f"Page title: {title}", tag="TEST")
|
||||
|
||||
# Check if images are blocked in text mode
|
||||
# We'll check if any image requests were made
|
||||
has_images = False
try:
    async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
        # Try to load a page with images
        await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
    await request_info.value
    has_images = True
except Exception:
    # No matching image request within the timeout means text mode is blocking images
    has_images = False
|
||||
|
||||
logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_context_reuse():
|
||||
"""Test context caching and reuse with identical configurations."""
|
||||
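# Expected behaviour under test: BrowserManager caches contexts keyed by the
# crawler configuration, so two identical configs should resolve to the same
# context while a differing config should produce a new one.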
logger.info("Testing context reuse with identical configurations", tag="TEST")
|
||||
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Create browser manager
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await manager.start()
|
||||
logger.info("Browser started successfully", tag="TEST")
|
||||
|
||||
# Create identical crawler configs
|
||||
crawler_config1 = CrawlerRunConfig(
|
||||
css_selector="body",
|
||||
)
|
||||
|
||||
crawler_config2 = CrawlerRunConfig(
|
||||
css_selector="body",
|
||||
)
|
||||
|
||||
# Get pages with these configs
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
|
||||
# Check if contexts are reused
|
||||
is_same_context = context1 == context2
|
||||
logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
|
||||
|
||||
# Now try with a different config
|
||||
crawler_config3 = CrawlerRunConfig()
|
||||
|
||||
page3, context3 = await manager.get_page(crawler_config3)
|
||||
|
||||
# This should be a different context
|
||||
is_different_context = context1 != context3
|
||||
logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
# Both tests should pass for success
|
||||
return is_same_context and is_different_context
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Ensure cleanup
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_playwright_session_management():
|
||||
"""Test session management with Playwright browser."""
|
||||
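# Expected behaviour under test: pages are keyed by session_id in CrawlerRunConfig,
# so re-requesting the same id should return the same page/context (with
# localStorage intact) until kill_session() is called for that id.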
logger.info("Testing session management with Playwright browser", tag="TEST")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
try:
|
||||
await manager.start()
|
||||
logger.info("Browser launched successfully", tag="TEST")
|
||||
|
||||
# Create two sessions
|
||||
session1_id = "playwright_session_1"
|
||||
session2_id = "playwright_session_2"
|
||||
|
||||
# Set up first session
|
||||
crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
|
||||
page1, context1 = await manager.get_page(crawler_config1)
|
||||
await page1.goto("https://example.com")
|
||||
await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
|
||||
logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
|
||||
|
||||
# Set up second session
|
||||
crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
|
||||
page2, context2 = await manager.get_page(crawler_config2)
|
||||
await page2.goto("https://example.org")
|
||||
await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
|
||||
logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
|
||||
|
||||
# Get first session again
|
||||
page1_again, context1_again = await manager.get_page(crawler_config1)
|
||||
|
||||
# Verify it's the same page and data persists
|
||||
is_same_page = page1 == page1_again
|
||||
is_same_context = context1 == context1_again
|
||||
data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
|
||||
logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
|
||||
|
||||
# Kill first session
|
||||
await manager.kill_session(session1_id)
|
||||
logger.info(f"Killed session 1", tag="TEST")
|
||||
|
||||
# Verify second session still works
|
||||
data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
|
||||
logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager.close()
|
||||
logger.info("Browser closed successfully", tag="TEST")
|
||||
|
||||
return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
# results.append(await test_start_close())
|
||||
# results.append(await test_playwright_basic())
|
||||
# results.append(await test_playwright_text_mode())
|
||||
# results.append(await test_playwright_context_reuse())
|
||||
results.append(await test_playwright_session_management())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
176
tests/browser/test_profiles.py
Normal file
176
tests/browser/test_profiles.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Test examples for BrowserProfileManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserProfileManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_profile_creation():
|
||||
"""Test creating and managing browser profiles."""
|
||||
logger.info("Testing profile creation and management", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
|
||||
try:
|
||||
# List existing profiles
|
||||
profiles = profile_manager.list_profiles()
|
||||
logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")
|
||||
|
||||
# Generate a unique profile name for testing
|
||||
test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"
|
||||
|
||||
# Create a test profile directory
|
||||
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
|
||||
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
|
||||
|
||||
# Create a dummy Preferences file to simulate a Chrome profile
|
||||
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
|
||||
f.write("{\"test\": true}")
|
||||
|
||||
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
|
||||
|
||||
# Verify the profile is now in the list
|
||||
profiles = profile_manager.list_profiles()
|
||||
profile_found = any(p["name"] == test_profile_name for p in profiles)
|
||||
logger.info(f"Profile found in list: {profile_found}", tag="TEST")
|
||||
|
||||
# Try to get the profile path
|
||||
retrieved_path = profile_manager.get_profile_path(test_profile_name)
|
||||
path_match = retrieved_path == profile_path
|
||||
logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")
|
||||
|
||||
# Delete the profile
|
||||
success = profile_manager.delete_profile(test_profile_name)
|
||||
logger.info(f"Profile deletion successful: {success}", tag="TEST")
|
||||
|
||||
# Verify it's gone
|
||||
profiles_after = profile_manager.list_profiles()
|
||||
profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
|
||||
logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")
|
||||
|
||||
# Clean up just in case
|
||||
if os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
|
||||
return profile_found and path_match and success and profile_removed
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up test directory
|
||||
try:
|
||||
if os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def test_profile_with_browser():
|
||||
"""Test using a profile with a browser."""
|
||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||
profile_path = None
|
||||
|
||||
try:
|
||||
# Create a test profile directory
|
||||
profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
|
||||
os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
|
||||
|
||||
# Create a dummy Preferences file to simulate a Chrome profile
|
||||
with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
|
||||
f.write("{\"test\": true}")
|
||||
|
||||
logger.info(f"Created test profile at: {profile_path}", tag="TEST")
|
||||
|
||||
# Now use this profile with a browser
|
||||
browser_config = BrowserConfig(
|
||||
user_data_dir=profile_path,
|
||||
headless=True
|
||||
)
|
||||
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
|
||||
# Start the browser with the profile
|
||||
await manager.start()
|
||||
logger.info("Browser started with profile", tag="TEST")
|
||||
|
||||
# Create a page
|
||||
crawler_config = CrawlerRunConfig()
|
||||
page, context = await manager.get_page(crawler_config)
|
||||
|
||||
# Navigate and set some data to verify profile works
|
||||
await page.goto("https://example.com")
|
||||
await page.evaluate("localStorage.setItem('test_data', 'profile_value')")
|
||||
|
||||
# Close browser
|
||||
await manager.close()
|
||||
logger.info("First browser session closed", tag="TEST")
|
||||
|
||||
# Create a new browser with the same profile
|
||||
manager2 = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
await manager2.start()
|
||||
logger.info("Second browser session started with same profile", tag="TEST")
|
||||
|
||||
# Get a page and check if the data persists
|
||||
page2, context2 = await manager2.get_page(crawler_config)
|
||||
await page2.goto("https://example.com")
|
||||
data = await page2.evaluate("localStorage.getItem('test_data')")
|
||||
|
||||
# Verify data persisted
|
||||
data_persisted = data == "profile_value"
|
||||
logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")
|
||||
|
||||
# Clean up
|
||||
await manager2.close()
|
||||
logger.info("Second browser session closed", tag="TEST")
|
||||
|
||||
# Delete the test profile
|
||||
success = profile_manager.delete_profile(test_profile_name)
|
||||
logger.info(f"Test profile deleted: {success}", tag="TEST")
|
||||
|
||||
return data_persisted and success
|
||||
except Exception as e:
|
||||
logger.error(f"Test failed: {str(e)}", tag="TEST")
|
||||
# Clean up
|
||||
try:
|
||||
if profile_path and os.path.exists(profile_path):
|
||||
shutil.rmtree(profile_path, ignore_errors=True)
|
||||
except:
|
||||
pass
|
||||
return False
|
||||
|
||||
async def run_tests():
|
||||
"""Run all tests sequentially."""
|
||||
results = []
|
||||
|
||||
results.append(await test_profile_creation())
|
||||
results.append(await test_profile_with_browser())
|
||||
|
||||
# Print summary
|
||||
total = len(results)
|
||||
passed = sum(results)
|
||||
logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
|
||||
|
||||
if passed == total:
|
||||
logger.success("All tests passed!", tag="SUMMARY")
|
||||
else:
|
||||
logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
133
tests/cli/test_cli.py
Normal file
133
tests/cli/test_cli.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import pytest
|
||||
from click.testing import CliRunner
|
||||
from pathlib import Path
|
||||
import json
|
||||
import yaml
|
||||
from crawl4ai.cli import cli, load_config_file, parse_key_values
|
||||
import tempfile
|
||||
import os
|
||||
import click
|
||||
|
||||
@pytest.fixture
|
||||
def runner():
|
||||
return CliRunner()
|
||||
|
||||
@pytest.fixture
|
||||
def temp_config_dir():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
old_home = os.environ.get('HOME')
|
||||
os.environ['HOME'] = tmpdir
|
||||
yield Path(tmpdir)
|
||||
if old_home:
|
||||
os.environ['HOME'] = old_home
|
||||
|
||||
@pytest.fixture
|
||||
def sample_configs(temp_config_dir):
|
||||
configs = {
|
||||
'browser.yml': {
|
||||
'headless': True,
|
||||
'viewport_width': 1280,
|
||||
'user_agent_mode': 'random'
|
||||
},
|
||||
'crawler.yml': {
|
||||
'cache_mode': 'bypass',
|
||||
'wait_until': 'networkidle',
|
||||
'scan_full_page': True
|
||||
},
|
||||
'extract_css.yml': {
|
||||
'type': 'json-css',
|
||||
'params': {'verbose': True}
|
||||
},
|
||||
'css_schema.json': {
|
||||
'name': 'ArticleExtractor',
|
||||
'baseSelector': '.article',
|
||||
'fields': [
|
||||
{'name': 'title', 'selector': 'h1.title', 'type': 'text'},
|
||||
{'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
for filename, content in configs.items():
|
||||
path = temp_config_dir / filename
|
||||
with open(path, 'w') as f:
|
||||
if filename.endswith('.yml'):
|
||||
yaml.dump(content, f)
|
||||
else:
|
||||
json.dump(content, f)
|
||||
|
||||
return {name: str(temp_config_dir / name) for name in configs}
|
||||
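# The returned paths are consumed either directly via load_config_file
# (see TestConfigLoading) or through CLI options such as --browser-config
# (see TestErrorHandling below).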
|
||||
class TestCLIBasics:
|
||||
def test_help(self, runner):
|
||||
result = runner.invoke(cli, ['--help'])
|
||||
assert result.exit_code == 0
|
||||
assert 'Crawl4AI CLI' in result.output
|
||||
|
||||
def test_examples(self, runner):
|
||||
result = runner.invoke(cli, ['--example'])
|
||||
assert result.exit_code == 0
|
||||
assert 'Examples' in result.output
|
||||
|
||||
def test_missing_url(self, runner):
|
||||
result = runner.invoke(cli)
|
||||
assert result.exit_code != 0
|
||||
assert 'URL argument is required' in result.output
|
||||
|
||||
class TestConfigParsing:
|
||||
def test_parse_key_values_basic(self):
|
||||
result = parse_key_values(None, None, "key1=value1,key2=true")
|
||||
assert result == {'key1': 'value1', 'key2': True}
|
||||
|
||||
def test_parse_key_values_invalid(self):
|
||||
with pytest.raises(click.BadParameter):
|
||||
parse_key_values(None, None, "invalid_format")
|
||||
|
||||
class TestConfigLoading:
|
||||
def test_load_yaml_config(self, sample_configs):
|
||||
config = load_config_file(sample_configs['browser.yml'])
|
||||
assert config['headless'] is True
|
||||
assert config['viewport_width'] == 1280
|
||||
|
||||
def test_load_json_config(self, sample_configs):
|
||||
config = load_config_file(sample_configs['css_schema.json'])
|
||||
assert config['name'] == 'ArticleExtractor'
|
||||
assert len(config['fields']) == 2
|
||||
|
||||
def test_load_nonexistent_config(self):
|
||||
with pytest.raises(click.BadParameter):
|
||||
load_config_file('nonexistent.yml')
|
||||
|
||||
class TestLLMConfig:
|
||||
def test_llm_config_creation(self, temp_config_dir, runner):
|
||||
def input_simulation(inputs):
|
||||
return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
|
||||
input='\n'.join(inputs))
|
||||
|
||||
class TestCrawlingFeatures:
|
||||
def test_basic_crawl(self, runner):
|
||||
result = runner.invoke(cli, ['https://example.com'])
|
||||
assert result.exit_code == 0
|
||||
|
||||
|
||||
class TestErrorHandling:
|
||||
def test_invalid_config_file(self, runner):
|
||||
result = runner.invoke(cli, [
|
||||
'https://example.com',
|
||||
'--browser-config', 'nonexistent.yml'
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
def test_invalid_schema(self, runner, temp_config_dir):
|
||||
invalid_schema = temp_config_dir / 'invalid_schema.json'
|
||||
with open(invalid_schema, 'w') as f:
|
||||
f.write('invalid json')
|
||||
|
||||
result = runner.invoke(cli, [
|
||||
'https://example.com',
|
||||
'--schema', str(invalid_schema)
|
||||
])
|
||||
assert result.exit_code != 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main(['-v', '-s', '--tb=native', __file__])
|
||||
113
tests/docker/test_config_object.py
Normal file
113
tests/docker/test_config_object.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import json
|
||||
from crawl4ai import (
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
RegexChunking,
|
||||
JsonCssExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
from crawl4ai.deep_crawling.filters import FastFilterChain
|
||||
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
|
||||
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
|
||||
|
||||
def create_test_config() -> CrawlerRunConfig:
|
||||
# Set up content filtering and markdown generation
|
||||
content_filter = BM25ContentFilter(
|
||||
user_query="technology articles",
|
||||
)
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=content_filter,
|
||||
options={"ignore_links": False, "body_width": 0}
|
||||
)
|
||||
|
||||
# Set up extraction strategy
|
||||
extraction_schema = {
|
||||
"name": "ArticleExtractor",
|
||||
"baseSelector": "article.content",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".article-body", "type": "html"}
|
||||
]
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema=extraction_schema)
|
||||
|
||||
# Set up deep crawling
|
||||
filter_chain = FastFilterChain([
|
||||
FastContentTypeFilter(["text/html"]),
|
||||
FastDomainFilter(blocked_domains=["ads.*"])
|
||||
])
|
||||
|
||||
url_scorer = FastKeywordRelevanceScorer(
|
||||
keywords=["article", "blog"],
|
||||
weight=1.0
|
||||
)
|
||||
|
||||
deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=url_scorer
|
||||
)
|
||||
|
||||
# Create the config
|
||||
config = CrawlerRunConfig(
|
||||
word_count_threshold=200,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
|
||||
markdown_generator=markdown_generator,
|
||||
css_selector="main.content",
|
||||
excluded_tags=["nav", "footer"],
|
||||
keep_attrs=["href", "src"],
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_until="networkidle",
|
||||
page_timeout=30000,
|
||||
scan_full_page=True,
|
||||
deep_crawl_strategy=deep_crawl_strategy,
|
||||
verbose=True,
|
||||
stream=True
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
def test_config_serialization_cycle():
|
||||
# Create original config
|
||||
original_config = create_test_config()
|
||||
|
||||
# Dump to serializable dictionary
|
||||
serialized = original_config.dump()
|
||||
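# dump() is expected to yield a plain JSON-serializable dict in which nested
# strategies appear as {"type": ..., "params": {...}} entries, which is what
# allows the load() call below to rebuild the equivalent config object.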
|
||||
print(json.dumps(serialized, indent=2))
|
||||
|
||||
# Load back into config object
|
||||
deserialized_config = CrawlerRunConfig.load(serialized)
|
||||
|
||||
# Verify core attributes
|
||||
assert deserialized_config.word_count_threshold == original_config.word_count_threshold
|
||||
assert deserialized_config.css_selector == original_config.css_selector
|
||||
assert deserialized_config.excluded_tags == original_config.excluded_tags
|
||||
assert deserialized_config.keep_attrs == original_config.keep_attrs
|
||||
assert deserialized_config.cache_mode == original_config.cache_mode
|
||||
assert deserialized_config.wait_until == original_config.wait_until
|
||||
assert deserialized_config.page_timeout == original_config.page_timeout
|
||||
assert deserialized_config.scan_full_page == original_config.scan_full_page
|
||||
assert deserialized_config.verbose == original_config.verbose
|
||||
assert deserialized_config.stream == original_config.stream
|
||||
|
||||
# Verify complex objects
|
||||
assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy)
|
||||
assert isinstance(deserialized_config.chunking_strategy, RegexChunking)
|
||||
assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator)
|
||||
assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter)
|
||||
assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy)
|
||||
|
||||
# Verify deep crawl strategy configuration
|
||||
assert deserialized_config.deep_crawl_strategy.max_depth == 3
|
||||
assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain)
|
||||
assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)
|
||||
|
||||
print("Serialization cycle test passed successfully!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_config_serialization_cycle()
|
||||
175
tests/docker/test_docker.py
Normal file
175
tests/docker/test_docker.py
Normal file
@@ -0,0 +1,175 @@
|
||||
import requests
|
||||
import time
|
||||
import httpx
|
||||
import asyncio
|
||||
from typing import Dict, Any
|
||||
from crawl4ai import (
|
||||
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
||||
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
||||
)
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235"):
|
||||
self.base_url = base_url
|
||||
|
||||
def submit_and_wait(
|
||||
self, request_data: Dict[str, Any], timeout: int = 300
|
||||
) -> Dict[str, Any]:
|
||||
# Submit crawl job
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data)
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Task ID: {task_id}")
|
||||
|
||||
# Poll for result
|
||||
start_time = time.time()
|
||||
while True:
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError(
|
||||
f"Task {task_id} did not complete within {timeout} seconds"
|
||||
)
|
||||
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}")
|
||||
status = result.json()
|
||||
|
||||
if status["status"] == "failed":
|
||||
print("Task failed:", status.get("error"))
|
||||
raise Exception(f"Task failed: {status.get('error')}")
|
||||
|
||||
if status["status"] == "completed":
|
||||
return status
|
||||
|
||||
time.sleep(2)
|
||||
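# Minimal usage sketch for the polling helper above (payload shape is illustrative):
#   tester = Crawl4AiTester()
#   status = tester.submit_and_wait({"urls": ["https://example.com"]})
#   print(status["status"])  # "completed" once the task has finished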
|
||||
async def test_direct_api():
|
||||
"""Test direct API endpoints without using the client SDK"""
|
||||
print("\n=== Testing Direct API Calls ===")
|
||||
|
||||
# Test 1: Basic crawl with content filtering
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1200,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed",
|
||||
min_word_threshold=0
|
||||
),
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
)
|
||||
|
||||
request_data = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": browser_config.dump(),
|
||||
"crawler_config": crawler_config.dump()
|
||||
}
|
||||
|
||||
# Make direct API call
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"http://localhost:8000/crawl",
|
||||
json=request_data,
|
||||
timeout=300
|
||||
)
|
||||
assert response.status_code == 200
|
||||
result = response.json()
|
||||
print("Basic crawl result:", result["success"])
|
||||
|
||||
# Test 2: Structured extraction with JSON CSS
|
||||
schema = {
|
||||
"baseSelector": "article.post",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema=schema)
|
||||
)
|
||||
|
||||
request_data["crawler_config"] = crawler_config.dump()
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
"http://localhost:8000/crawl",
|
||||
json=request_data
|
||||
)
|
||||
assert response.status_code == 200
|
||||
result = response.json()
|
||||
print("Structured extraction result:", result["success"])
|
||||
|
||||
# Test 3: Get schema
|
||||
# async with httpx.AsyncClient() as client:
|
||||
# response = await client.get("http://localhost:8000/schema")
|
||||
# assert response.status_code == 200
|
||||
# schemas = response.json()
|
||||
# print("Retrieved schemas for:", list(schemas.keys()))
|
||||
|
||||
async def test_with_client():
|
||||
"""Test using the Crawl4AI Docker client SDK"""
|
||||
print("\n=== Testing Client SDK ===")
|
||||
|
||||
async with Crawl4aiDockerClient(verbose=True) as client:
|
||||
# Test 1: Basic crawl
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed"
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = await client.crawl(
|
||||
urls=["https://example.com"],
|
||||
browser_config=browser_config,
|
||||
crawler_config=crawler_config
|
||||
)
|
||||
print("Client SDK basic crawl:", result.success)
|
||||
|
||||
# Test 2: LLM extraction with streaming
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o"),
|
||||
instruction="Extract key technical concepts"
|
||||
)
|
||||
),
|
||||
stream=True
|
||||
)
|
||||
|
||||
async for result in await client.crawl(
|
||||
urls=["https://example.com"],
|
||||
browser_config=browser_config,
|
||||
crawler_config=crawler_config
|
||||
):
|
||||
print(f"Streaming result for: {result.url}")
|
||||
|
||||
# # Test 3: Get schema
|
||||
# schemas = await client.get_schema()
|
||||
# print("Retrieved client schemas for:", list(schemas.keys()))
|
||||
|
||||
async def main():
|
||||
"""Run all tests"""
|
||||
# Test direct API
|
||||
print("Testing direct API calls...")
|
||||
await test_direct_api()
|
||||
|
||||
# Test client SDK
|
||||
print("\nTesting client SDK...")
|
||||
await test_with_client()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
34
tests/docker/test_dockerclient.py
Normal file
34
tests/docker/test_dockerclient.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import asyncio
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig
|
||||
)
|
||||
|
||||
async def main():
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
|
||||
await client.authenticate("test@example.com")
|
||||
|
||||
# Non-streaming crawl
|
||||
results = await client.crawl(
|
||||
["https://example.com", "https://python.org"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig()
|
||||
)
|
||||
print(f"Non-streaming results: {results}")
|
||||
|
||||
# Streaming crawl
|
||||
crawler_config = CrawlerRunConfig(stream=True)
|
||||
async for result in await client.crawl(
|
||||
["https://example.com", "https://python.org"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=crawler_config
|
||||
):
|
||||
print(f"Streamed result: {result}")
|
||||
|
||||
# Get schema
|
||||
schema = await client.get_schema()
|
||||
print(f"Schema: {schema}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
@@ -0,0 +1,596 @@
|
||||
# ==== File: test_rest_api_deep_crawl.py ====
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import httpx
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, AsyncGenerator
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv() # Load environment variables from .env file if present
|
||||
|
||||
# --- Test Configuration ---
|
||||
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")  # Uncomment if the server is running in Docker (use the host's IP)
|
||||
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode
|
||||
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
|
||||
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
|
||||
|
||||
# --- Helper Functions ---
|
||||
def load_proxies_from_env() -> List[Dict]:
|
||||
"""Load proxies from PROXIES environment variable"""
|
||||
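# Expected PROXIES format (placeholder values), matching the parsing below:
#   PROXIES="203.0.113.10:8080:user:pass,203.0.113.11:3128"
# Four colon-separated fields include credentials; two fields give a bare ip:port proxy.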
proxies = []
|
||||
proxies_str = os.getenv("PROXIES", "")
|
||||
if not proxies_str:
|
||||
print("PROXIES environment variable not set or empty.")
|
||||
return proxies
|
||||
try:
|
||||
proxy_list = proxies_str.split(",")
|
||||
for proxy in proxy_list:
|
||||
proxy = proxy.strip()
|
||||
if not proxy:
|
||||
continue
|
||||
parts = proxy.split(":")
|
||||
if len(parts) == 4:
|
||||
ip, port, username, password = parts
|
||||
proxies.append({
|
||||
"server": f"http://{ip}:{port}", # Assuming http, adjust if needed
|
||||
"username": username,
|
||||
"password": password,
|
||||
"ip": ip # Store original IP if available
|
||||
})
|
||||
elif len(parts) == 2: # ip:port only
|
||||
ip, port = parts
|
||||
proxies.append({
|
||||
"server": f"http://{ip}:{port}",
|
||||
"ip": ip
|
||||
})
|
||||
else:
|
||||
print(f"Skipping invalid proxy string format: {proxy}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading proxies from environment: {e}")
|
||||
return proxies
|
||||
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient):
|
||||
"""Check if the server is healthy before running tests."""
|
||||
try:
|
||||
response = await client.get("/health")
|
||||
response.raise_for_status()
|
||||
print(f"\nServer healthy: {response.json()}")
|
||||
return True
|
||||
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
|
||||
|
||||
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
|
||||
"""Asserts the basic structure of a single crawl result."""
|
||||
assert isinstance(result, dict)
|
||||
assert "url" in result
|
||||
assert "success" in result
|
||||
assert "html" in result # Basic crawls should return HTML
|
||||
assert "metadata" in result
|
||||
assert isinstance(result["metadata"], dict)
|
||||
assert "depth" in result["metadata"] # Deep crawls add depth
|
||||
|
||||
if check_ssl:
|
||||
assert "ssl_certificate" in result # Check if SSL info is present
|
||||
assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
|
||||
|
||||
|
||||
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
|
||||
"""Processes an NDJSON streaming response."""
|
||||
results = []
|
||||
completed = False
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get("status") == "completed":
|
||||
completed = True
|
||||
break # Stop processing after completion marker
|
||||
elif data.get("url"): # Ensure it looks like a result object
|
||||
results.append(data)
|
||||
else:
|
||||
print(f"Received non-result JSON line: {data}") # Log other status messages if needed
|
||||
except json.JSONDecodeError:
|
||||
pytest.fail(f"Failed to decode JSON line: {line}")
|
||||
assert completed, "Streaming response did not end with a completion marker."
|
||||
return results
|
||||
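# Illustration (assumption about the wire format, inferred from the parser above): the
# /crawl/stream endpoint is expected to emit newline-delimited JSON, one result object
# per line, terminated by a completion marker, e.g.
#
#     {"url": "https://docs.crawl4ai.com/samples/deepcrawl/", "success": true, "metadata": {"depth": 0}, ...}
#     {"status": "completed"}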
|
||||
|
||||
# --- Pytest Fixtures ---
|
||||
@pytest_asyncio.fixture(scope="function")
|
||||
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
|
||||
"""Provides an async HTTP client"""
|
||||
# Increased timeout for potentially longer deep crawls
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
|
||||
yield client
|
||||
# No explicit close needed with 'async with'
|
||||
|
||||
# --- Test Class ---
|
||||
@pytest.mark.asyncio
|
||||
class TestDeepCrawlEndpoints:
|
||||
|
||||
@pytest_asyncio.fixture(autouse=True)
|
||||
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
|
||||
"""Fixture to ensure server is healthy before each test in the class."""
|
||||
await check_server_health(async_client)
|
||||
|
||||
# 1. Basic Deep Crawl
|
||||
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with limited depth and pages."""
|
||||
max_depth = 1
|
||||
max_pages = 3 # start_url + 2 more
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS", # Use string value for CacheMode
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
# Minimal filters for basic test
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) > 1 # Should be more than just the start URL
|
||||
assert len(data["results"]) <= max_pages # Respect max_pages
|
||||
|
||||
found_depth_0 = False
|
||||
found_depth_1 = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
depth = result["metadata"]["depth"]
|
||||
assert depth <= max_depth
|
||||
if depth == 0: found_depth_0 = True
|
||||
if depth == 1: found_depth_1 = True
|
||||
|
||||
assert found_depth_0
|
||||
assert found_depth_1
|
||||
|
||||
# 2. Deep Crawl with Filtering
|
||||
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with content type and domain filters."""
|
||||
max_depth = 1
|
||||
max_pages = 5
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
},
|
||||
{
|
||||
"type": "ContentTypeFilter",
|
||||
"params": {"allowed_types": ["text/html"]}
|
||||
},
|
||||
# Example: Exclude specific paths using a URL pattern
|
||||
{
|
||||
"type": "URLPatternFilter",
|
||||
"params": {
|
||||
"patterns": ["*/category-3/*"], # Block category 3
|
||||
"reverse": True # Block if match
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
assert "category-3" not in result["url"] # Check if filter worked
|
||||
assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# 3. Deep Crawl with Scoring
|
||||
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with URL scoring."""
|
||||
max_depth = 1
|
||||
max_pages = 4
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": { # Keep basic domain filter
|
||||
"type": "FilterChain",
|
||||
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||
},
|
||||
"url_scorer": { # Add scorer
|
||||
"type": "CompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{ # Favor pages with 'product' in the URL
|
||||
"type": "KeywordRelevanceScorer",
|
||||
"params": {"keywords": ["product"], "weight": 1.0}
|
||||
},
|
||||
{ # Penalize deep paths slightly
|
||||
"type": "PathDepthScorer",
|
||||
"params": {"optimal_depth": 2, "weight": -0.2}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
# Set a threshold if needed: "score_threshold": 0.1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
||||
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
||||
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
||||
# We expect scoring to prioritize product pages if available within limits
|
||||
# assert product_urls_found # This might be too strict depending on site structure and limits
|
||||
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# 4. Deep Crawl with CSS Extraction
|
||||
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
|
||||
max_depth = 6 # Go deep enough to reach product pages
|
||||
max_pages = 20
|
||||
# Schema to extract product details
|
||||
product_schema = {
|
||||
"name": "ProductDetails",
|
||||
"baseSelector": "div.container", # Base for product page
|
||||
"fields": [
|
||||
{"name": "product_title", "selector": "h1", "type": "text"},
|
||||
{"name": "price", "selector": ".product-price", "type": "text"},
|
||||
{"name": "description", "selector": ".product-description p", "type": "text"},
|
||||
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
|
||||
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
|
||||
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
|
||||
]}
|
||||
]
|
||||
}
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"extraction_strategy": { # Apply extraction to ALL crawled pages
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {"schema": {"type": "dict", "value": product_schema}}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": { # Only crawl HTML on our domain
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
]
|
||||
}
|
||||
}
|
||||
# Optional: Add scoring to prioritize product pages for extraction
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
# assert len(data["results"]) <= max_pages
|
||||
|
||||
found_extracted_product = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
if "product_" in result["url"]: # Check product pages specifically
|
||||
assert result["extracted_content"] is not None
|
||||
try:
|
||||
extracted = json.loads(result["extracted_content"])
|
||||
# Schema returns list even if one base match
|
||||
assert isinstance(extracted, list)
|
||||
if extracted:
|
||||
item = extracted[0]
|
||||
assert "product_title" in item and item["product_title"]
|
||||
assert "price" in item and item["price"]
|
||||
# Specs might be empty list if not found
|
||||
assert "specs" in item and isinstance(item["specs"], list)
|
||||
found_extracted_product = True
|
||||
print(f"Extracted product: {item.get('product_title')}")
|
||||
except (json.JSONDecodeError, AssertionError, IndexError) as e:
|
||||
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
# else:
|
||||
# # Non-product pages might have None or empty list depending on schema match
|
||||
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
|
||||
|
||||
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
||||
|
||||
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
|
||||
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
|
||||
max_depth = 1 # Limit depth to keep LLM calls manageable
|
||||
max_pages = 3
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"extraction_strategy": { # Apply LLM extraction to crawled pages
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
|
||||
"llm_config": { # Example override, rely on server default if possible
|
||||
"type": "LLMConfig",
|
||||
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
|
||||
},
|
||||
"schema": { # Expected JSON output
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"title": "PageContent", "type": "object",
|
||||
"properties": {
|
||||
"h1_title": {"type": "string"},
|
||||
"first_paragraph": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
|
||||
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
found_llm_extraction = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
try:
|
||||
extracted = json.loads(result["extracted_content"])
|
||||
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
|
||||
assert isinstance(extracted, dict)
|
||||
assert "h1_title" in extracted # Check keys based on schema
|
||||
assert "first_paragraph" in extracted
|
||||
found_llm_extraction = True
|
||||
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
|
||||
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
|
||||
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
||||
|
||||
|
||||
# 6. Deep Crawl with SSL Certificate Fetching
|
||||
async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with fetch_ssl_certificate enabled."""
|
||||
max_depth = 0 # Only fetch for start URL to keep test fast
|
||||
max_pages = 1
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"fetch_ssl_certificate": True, # <-- Enable SSL fetching
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
|
||||
await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
|
||||
assert result["success"] is True
|
||||
# Check if SSL info was actually retrieved
|
||||
if result["ssl_certificate"]:
|
||||
# Assert directly using dictionary keys
|
||||
assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
|
||||
assert "issuer" in result["ssl_certificate"]
|
||||
assert "subject" in result["ssl_certificate"]
|
||||
# --- MODIFIED ASSERTIONS ---
|
||||
assert "not_before" in result["ssl_certificate"] # Check for the actual key
|
||||
assert "not_after" in result["ssl_certificate"] # Check for the actual key
|
||||
# --- END MODIFICATIONS ---
|
||||
assert "fingerprint" in result["ssl_certificate"] # Check another key
|
||||
|
||||
# This print statement using .get() already works correctly with dictionaries
|
||||
print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
|
||||
print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
|
||||
else:
|
||||
# This part remains the same
|
||||
print("SSL Certificate was null in the result.")
|
||||
|
||||
|
||||
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
|
||||
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl using proxy rotation."""
|
||||
proxies = load_proxies_from_env()
|
||||
if not proxies:
|
||||
pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
|
||||
|
||||
print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
|
||||
|
||||
max_depth = 1
|
||||
max_pages = 3
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
|
||||
# Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"proxy_rotation_strategy": { # <-- Define the strategy
|
||||
"type": "RoundRobinProxyStrategy",
|
||||
"params": {
|
||||
# Convert ProxyConfig dicts back to the serialized format expected by server
|
||||
"proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
|
||||
}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Proxies often cause connection errors, catch them
|
||||
pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
# Primary assertion is that the crawl succeeded *with* proxy config
|
||||
print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
|
||||
|
||||
# Verifying specific proxy usage requires server logs or custom headers/responses
|
||||
|
||||
|
||||
# --- Main Execution Block (for running script directly) ---
|
||||
if __name__ == "__main__":
|
||||
pytest_args = ["-v", "-s", __file__]
|
||||
# Example: Run only proxy test
|
||||
# pytest_args.append("-k test_deep_crawl_with_proxies")
|
||||
print(f"Running pytest with args: {pytest_args}")
|
||||
exit_code = pytest.main(pytest_args)
|
||||
print(f"Pytest finished with exit code: {exit_code}")
|
||||
255
tests/docker/test_serialization.py
Normal file
@@ -0,0 +1,255 @@
|
||||
import inspect
|
||||
from typing import Any, Dict
|
||||
from enum import Enum
|
||||
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
def to_serializable_dict(obj: Any) -> Dict:
|
||||
"""
|
||||
Recursively convert an object to a serializable dictionary using {type, params} structure
|
||||
for complex objects.
|
||||
"""
|
||||
if obj is None:
|
||||
return None
|
||||
|
||||
# Handle basic types
|
||||
if isinstance(obj, (str, int, float, bool)):
|
||||
return obj
|
||||
|
||||
# Handle Enum
|
||||
if isinstance(obj, Enum):
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
"params": obj.value
|
||||
}
|
||||
|
||||
# Handle datetime objects
|
||||
if hasattr(obj, 'isoformat'):
|
||||
return obj.isoformat()
|
||||
|
||||
# Handle lists, tuples, and sets
|
||||
if isinstance(obj, (list, tuple, set)):
|
||||
return [to_serializable_dict(item) for item in obj]
|
||||
|
||||
# Handle dictionaries - preserve them as-is
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
"type": "dict", # Mark as plain dictionary
|
||||
"value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
|
||||
}
|
||||
|
||||
# Handle class instances
|
||||
if hasattr(obj, '__class__'):
|
||||
# Get constructor signature
|
||||
sig = inspect.signature(obj.__class__.__init__)
|
||||
params = sig.parameters
|
||||
|
||||
# Get current values
|
||||
current_values = {}
|
||||
for name, param in params.items():
|
||||
if name == 'self':
|
||||
continue
|
||||
|
||||
value = getattr(obj, name, param.default)
|
||||
|
||||
# Only include if different from default, considering empty values
|
||||
if not (is_empty_value(value) and is_empty_value(param.default)):
|
||||
if value != param.default:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
"params": current_values
|
||||
}
|
||||
|
||||
return str(obj)
|
||||
|
||||
def from_serializable_dict(data: Any) -> Any:
|
||||
"""
|
||||
Recursively convert a serializable dictionary back to an object instance.
|
||||
"""
|
||||
if data is None:
|
||||
return None
|
||||
|
||||
# Handle basic types
|
||||
if isinstance(data, (str, int, float, bool)):
|
||||
return data
|
||||
|
||||
# Handle typed data
|
||||
if isinstance(data, dict) and "type" in data:
|
||||
# Handle plain dictionaries
|
||||
if data["type"] == "dict":
|
||||
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
return cls(data["params"])
|
||||
|
||||
# Handle class instances
|
||||
constructor_args = {
|
||||
k: from_serializable_dict(v) for k, v in data["params"].items()
|
||||
}
|
||||
return cls(**constructor_args)
|
||||
|
||||
# Handle lists
|
||||
if isinstance(data, list):
|
||||
return [from_serializable_dict(item) for item in data]
|
||||
|
||||
# Handle raw dictionaries (legacy support)
|
||||
if isinstance(data, dict):
|
||||
return {k: from_serializable_dict(v) for k, v in data.items()}
|
||||
|
||||
return data
|
||||
|
||||
def is_empty_value(value: Any) -> bool:
|
||||
"""Check if a value is effectively empty/null."""
|
||||
if value is None:
|
||||
return True
|
||||
if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# from crawl4ai import (
|
||||
# CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
|
||||
# PruningContentFilter, BM25ContentFilter, LLMContentFilter,
|
||||
# JsonCssExtractionStrategy, CosineStrategy, RegexChunking,
|
||||
# WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
# )
|
||||
|
||||
# # Test Case 1: BM25 content filtering through markdown generator
|
||||
# config1 = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.BYPASS,
|
||||
# markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=BM25ContentFilter(
|
||||
# user_query="technology articles",
|
||||
# bm25_threshold=1.2,
|
||||
# language="english"
|
||||
# )
|
||||
# ),
|
||||
# chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
|
||||
# excluded_tags=["nav", "footer", "aside"],
|
||||
# remove_overlay_elements=True
|
||||
# )
|
||||
|
||||
# # Serialize
|
||||
# serialized = to_serializable_dict(config1)
|
||||
# print("\nSerialized Config:")
|
||||
# print(serialized)
|
||||
|
||||
# # Example output structure would now look like:
|
||||
# """
|
||||
# {
|
||||
# "type": "CrawlerRunConfig",
|
||||
# "params": {
|
||||
# "cache_mode": {
|
||||
# "type": "CacheMode",
|
||||
# "params": "bypass"
|
||||
# },
|
||||
# "markdown_generator": {
|
||||
# "type": "DefaultMarkdownGenerator",
|
||||
# "params": {
|
||||
# "content_filter": {
|
||||
# "type": "BM25ContentFilter",
|
||||
# "params": {
|
||||
# "user_query": "technology articles",
|
||||
# "bm25_threshold": 1.2,
|
||||
# "language": "english"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# """
|
||||
|
||||
# # Deserialize
|
||||
# deserialized = from_serializable_dict(serialized)
|
||||
# print("\nDeserialized Config:")
|
||||
# print(to_serializable_dict(deserialized))
|
||||
|
||||
# # Verify they match
|
||||
# assert to_serializable_dict(config1) == to_serializable_dict(deserialized)
|
||||
# print("\nVerification passed: Configuration matches after serialization/deserialization!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
from crawl4ai import (
|
||||
CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
|
||||
PruningContentFilter, BM25ContentFilter, LLMContentFilter,
|
||||
JsonCssExtractionStrategy, RegexChunking,
|
||||
WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
)
|
||||
|
||||
# Test Case 1: BM25 content filtering through markdown generator
|
||||
config1 = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=BM25ContentFilter(
|
||||
user_query="technology articles",
|
||||
bm25_threshold=1.2,
|
||||
language="english"
|
||||
)
|
||||
),
|
||||
chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
|
||||
excluded_tags=["nav", "footer", "aside"],
|
||||
remove_overlay_elements=True
|
||||
)
|
||||
|
||||
# Test Case 2: LLM-based extraction with pruning filter
|
||||
schema = {
|
||||
"baseSelector": "article.post",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
config2 = CrawlerRunConfig(
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema=schema),
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed",
|
||||
min_word_threshold=0
|
||||
),
|
||||
options={"ignore_links": True}
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
# Test Case 3: LLM content filter
|
||||
config3 = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llm_config=LLMConfig(provider="openai/gpt-4"),
|
||||
instruction="Extract key technical concepts",
|
||||
chunk_token_threshold=2000,
|
||||
overlap_rate=0.1
|
||||
),
|
||||
options={"ignore_images": True}
|
||||
),
|
||||
scraping_strategy=WebScrapingStrategy()
|
||||
)
|
||||
|
||||
# Test all configurations
|
||||
test_configs = [config1, config2, config3]
|
||||
|
||||
for i, config in enumerate(test_configs, 1):
|
||||
print(f"\nTesting Configuration {i}:")
|
||||
|
||||
# Serialize
|
||||
serialized = to_serializable_dict(config)
|
||||
print(f"\nSerialized Config {i}:")
|
||||
print(serialized)
|
||||
|
||||
# Deserialize
|
||||
deserialized = from_serializable_dict(serialized)
|
||||
print(f"\nDeserialized Config {i}:")
|
||||
print(to_serializable_dict(deserialized)) # Convert back to dict for comparison
|
||||
|
||||
# Verify they match
|
||||
assert to_serializable_dict(config) == to_serializable_dict(deserialized)
|
||||
print(f"\nVerification passed: Configuration {i} matches after serialization/deserialization!")
|
||||
146
tests/docker/test_server.py
Normal file
@@ -0,0 +1,146 @@
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
async def test_endpoint(
|
||||
endpoint: str,
|
||||
url: str,
|
||||
params: Optional[dict] = None,
|
||||
expected_status: int = 200
|
||||
) -> Optional[dict]:
|
||||
"""Test an endpoint and print results"""
|
||||
import aiohttp
|
||||
|
||||
params = params or {}
|
||||
param_str = "&".join(f"{k}={v}" for k, v in params.items())
|
||||
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
|
||||
if param_str:
|
||||
full_url += f"?{param_str}"
|
||||
|
||||
print(f"\nTesting: {full_url}")
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(full_url) as response:
|
||||
status = response.status
|
||||
try:
|
||||
data = await response.json()
|
||||
except:
|
||||
data = await response.text()
|
||||
|
||||
print(f"Status: {status} (Expected: {expected_status})")
|
||||
if isinstance(data, dict):
|
||||
print(f"Response: {json.dumps(data, indent=2)}")
|
||||
else:
|
||||
print(f"Response: {data[:500]}...") # First 500 chars
|
||||
assert status == expected_status
|
||||
return data
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
return None
|
||||
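# Illustration (not part of the original tests): test_endpoint issues plain GET requests, e.g.
#
#     await test_endpoint("md", "example.com", {"f": "fit", "c": "0"})
#
# targets http://localhost:8000/md/example.com?f=fit&c=0 and asserts the default 200 status.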
|
||||
async def test_llm_task_completion(task_id: str) -> None:
|
||||
"""Poll task until completion"""
|
||||
for _ in range(10): # Try 10 times
|
||||
result = await test_endpoint("llm", task_id)
|
||||
if result and result.get("status") in ["completed", "failed"]:
|
||||
return result
|
||||
print("Task still processing, waiting 5 seconds...")
|
||||
await asyncio.sleep(5)
|
||||
print("Task timed out")
|
||||
|
||||
async def run_tests():
|
||||
print("Starting API Tests...")
|
||||
|
||||
# Test URLs
|
||||
urls = [
|
||||
"example.com",
|
||||
"https://www.python.org",
|
||||
"https://news.ycombinator.com/news",
|
||||
"https://github.com/trending"
|
||||
]
|
||||
|
||||
print("\n=== Testing Markdown Endpoint ===")
|
||||
for url in []:  # urls:
|
||||
# Test different filter types
|
||||
for filter_type in ["raw", "fit", "bm25", "llm"]:
|
||||
params = {"f": filter_type}
|
||||
if filter_type in ["bm25", "llm"]:
|
||||
params["q"] = "extract main content"
|
||||
|
||||
# Test with and without cache
|
||||
for cache in ["0", "1"]:
|
||||
params["c"] = cache
|
||||
await test_endpoint("md", url, params)
|
||||
await asyncio.sleep(1) # Be nice to the server
|
||||
|
||||
print("\n=== Testing LLM Endpoint ===")
|
||||
for url in []: # urls:
|
||||
# Test basic extraction
|
||||
result = await test_endpoint(
|
||||
"llm",
|
||||
url,
|
||||
{"q": "Extract title and main content"}
|
||||
)
|
||||
if result and "task_id" in result:
|
||||
print("\nChecking task completion...")
|
||||
await test_llm_task_completion(result["task_id"])
|
||||
|
||||
# Test with schema
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"content": {"type": "string"},
|
||||
"links": {"type": "array", "items": {"type": "string"}}
|
||||
}
|
||||
}
|
||||
result = await test_endpoint(
|
||||
"llm",
|
||||
url,
|
||||
{
|
||||
"q": "Extract content with links",
|
||||
"s": json.dumps(schema),
|
||||
"c": "1" # Test with cache
|
||||
}
|
||||
)
|
||||
if result and "task_id" in result:
|
||||
print("\nChecking schema task completion...")
|
||||
await test_llm_task_completion(result["task_id"])
|
||||
|
||||
await asyncio.sleep(2) # Be nice to the server
|
||||
|
||||
print("\n=== Testing Error Cases ===")
|
||||
# Test invalid URL
|
||||
await test_endpoint(
|
||||
"md",
|
||||
"not_a_real_url",
|
||||
expected_status=500
|
||||
)
|
||||
|
||||
# Test invalid filter type
|
||||
await test_endpoint(
|
||||
"md",
|
||||
"example.com",
|
||||
{"f": "invalid"},
|
||||
expected_status=422
|
||||
)
|
||||
|
||||
# Test LLM without query
|
||||
await test_endpoint(
|
||||
"llm",
|
||||
"example.com"
|
||||
)
|
||||
|
||||
# Test invalid task ID
|
||||
await test_endpoint(
|
||||
"llm",
|
||||
"llm_invalid_task",
|
||||
expected_status=404
|
||||
)
|
||||
|
||||
print("\nAll tests completed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
655
tests/docker/test_server_requests.py
Normal file
@@ -0,0 +1,655 @@
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import httpx
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, AsyncGenerator
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Optional: import crawl4ai classes directly as a reference for building payloads.
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON
# (see the illustration after the import block below).
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
BM25ContentFilter,
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
CompositeScorer,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
JsonCssExtractionStrategy,
|
||||
LLMExtractionStrategy,
|
||||
LLMConfig
|
||||
)
|
||||
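# Illustration (a minimal sketch, assuming the {type, params} serialization exercised in
# tests/docker/test_serialization.py): a config object such as
#
#     CrawlerRunConfig(stream=False, cache_mode=CacheMode.BYPASS)
#
# is written by hand in these request payloads as
#
#     {"type": "CrawlerRunConfig", "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}}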
|
||||
# --- Test Configuration ---
|
||||
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
|
||||
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
|
||||
# Use a known simple HTML page for basic tests
|
||||
SIMPLE_HTML_URL = "https://httpbin.org/html"
|
||||
# Use a site suitable for scraping tests
|
||||
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
|
||||
# Use a site with internal links for deep crawl tests
|
||||
DEEP_CRAWL_URL = "https://python.org"
|
||||
|
||||
# --- Pytest Fixtures ---
|
||||
|
||||
# Use the built-in event_loop fixture from pytest_asyncio
|
||||
# The custom implementation was causing issues with closing the loop
|
||||
|
||||
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
|
||||
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
|
||||
"""Provides an async HTTP client"""
|
||||
client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
|
||||
yield client
|
||||
await client.aclose()
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient):
|
||||
"""Check if the server is healthy before running tests."""
|
||||
try:
|
||||
response = await client.get("/health")
|
||||
response.raise_for_status()
|
||||
print(f"\nServer healthy: {response.json()}")
|
||||
return True
|
||||
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
|
||||
|
||||
async def assert_crawl_result_structure(result: Dict[str, Any]):
|
||||
"""Asserts the basic structure of a single crawl result."""
|
||||
assert isinstance(result, dict)
|
||||
assert "url" in result
|
||||
assert "success" in result
|
||||
assert "html" in result
|
||||
# Add more common checks if needed
|
||||
|
||||
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
|
||||
"""Processes an NDJSON streaming response."""
|
||||
results = []
|
||||
completed = False
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get("status") == "completed":
|
||||
completed = True
|
||||
break # Stop processing after completion marker
|
||||
else:
|
||||
results.append(data)
|
||||
except json.JSONDecodeError:
|
||||
pytest.fail(f"Failed to decode JSON line: {line}")
|
||||
assert completed, "Streaming response did not end with a completion marker."
|
||||
return results
|
||||
|
||||
|
||||
# --- Test Class ---
|
||||
|
||||
@pytest.mark.asyncio
|
||||
class TestCrawlEndpoints:
|
||||
|
||||
@pytest_asyncio.fixture(autouse=True)
|
||||
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
|
||||
"""Fixture to ensure server is healthy before each test in the class."""
|
||||
await check_server_health(async_client)
|
||||
|
||||
# 1. Simple Requests (Primitives)
|
||||
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with a single URL and simple config values."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False, # Explicitly false for /crawl
|
||||
"screenshot": False,
|
||||
"cache_mode": CacheMode.BYPASS.value # Use enum value
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error: {e}")
|
||||
print(f"Response content: {e.response.text}")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] == SIMPLE_HTML_URL
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||
# It might be null, missing, or populated depending on the server's default behavior
|
||||
|
||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl/stream with a single URL and simple config values."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": True, # Must be true for /crawl/stream
|
||||
"screenshot": False,
|
||||
"cache_mode": CacheMode.BYPASS.value
|
||||
}
|
||||
}
|
||||
}
|
||||
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
|
||||
response.raise_for_status()
|
||||
results = await process_streaming_response(response)
|
||||
|
||||
assert len(results) == 1
|
||||
result = results[0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] == SIMPLE_HTML_URL
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
|
||||
|
||||
# 2. Multi-URL and Dispatcher
|
||||
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
|
||||
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
|
||||
payload = {
|
||||
"urls": urls,
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) == len(urls)
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] in urls
|
||||
|
||||
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl/stream with multiple URLs."""
|
||||
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
|
||||
payload = {
|
||||
"urls": urls,
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
|
||||
}
|
||||
}
|
||||
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
|
||||
response.raise_for_status()
|
||||
results = await process_streaming_response(response)
|
||||
|
||||
assert len(results) == len(urls)
|
||||
processed_urls = set()
|
||||
for result in results:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] in urls
|
||||
processed_urls.add(result["url"])
|
||||
assert processed_urls == set(urls) # Ensure all URLs were processed
|
||||
|
||||
|
||||
# 3. Class Values and Nested Classes (Markdown Generator)
|
||||
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with MarkdownGenerator using PruningContentFilter."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.ENABLED.value, # Test different cache mode
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.5, # Example param
|
||||
"threshold_type": "relative"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "markdown" in result
|
||||
assert isinstance(result["markdown"], dict)
|
||||
assert "raw_markdown" in result["markdown"]
|
||||
assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
|
||||
assert "Moby-Dick" in result["markdown"]["raw_markdown"]
|
||||
# Fit markdown content might be different/shorter due to pruning
|
||||
assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
|
||||
|
||||
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with MarkdownGenerator using BM25ContentFilter."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "BM25ContentFilter",
|
||||
"params": {
|
||||
"user_query": "Herman Melville", # Query for BM25
|
||||
"bm25_threshold": 0.1, # Lower threshold to increase matches
|
||||
"language": "english" # Valid parameters
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Payload for BM25 test: {json.dumps(payload)}")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "markdown" in result
|
||||
assert isinstance(result["markdown"], dict)
|
||||
assert "raw_markdown" in result["markdown"]
|
||||
assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
|
||||
|
||||
# Print values for debug
|
||||
print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
|
||||
print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
|
||||
|
||||
# Either fit_markdown has content (possibly including our query terms)
|
||||
# or it might be empty if no good BM25 matches were found
|
||||
# Don't assert specific content since it can be environment-dependent
|
||||
|
||||
|
||||
# 4. Deep Crawling
|
||||
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with a deep crawl strategy."""
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_URL], # Start URL
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": 1, # Limit depth for testing speed
|
||||
"max_pages": 5, # Limit pages to crawl
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "ContentTypeFilter",
|
||||
"params": {"allowed_types": ["text/html"]}
|
||||
},
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"url_scorer": {
|
||||
"type": "CompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{
|
||||
"type": "KeywordRelevanceScorer",
|
||||
"params": {"keywords": ["documentation", "tutorial"]}
|
||||
},
|
||||
{
|
||||
"type": "PathDepthScorer",
|
||||
"params": {"weight": 0.5, "optimal_depth": 2}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
# Expect more than 1 result due to deep crawl (start URL + crawled links)
|
||||
assert len(data["results"]) > 1
|
||||
assert len(data["results"]) <= 6 # Start URL + max_links=5
|
||||
|
||||
start_url_found = False
|
||||
crawled_urls_found = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
|
||||
# Print URL for debugging
|
||||
print(f"Crawled URL: {result['url']}")
|
||||
|
||||
# Allow URLs that contain python.org (including subdomains like docs.python.org)
|
||||
assert "python.org" in result["url"]
|
||||
if result["url"] == DEEP_CRAWL_URL:
|
||||
start_url_found = True
|
||||
else:
|
||||
crawled_urls_found = True
|
||||
|
||||
assert start_url_found
|
||||
assert crawled_urls_found
|
||||
|
||||
|
||||
# 5. Extraction without LLM (JSON/CSS)
|
||||
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with JsonCssExtractionStrategy."""
|
||||
payload = {
|
||||
"urls": [SCRAPE_TARGET_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
|
||||
"value": {
|
||||
"name": "BookList",
|
||||
"baseSelector": "ol.row li.col-xs-6", # Select each book item
|
||||
"fields": [
|
||||
{"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
|
||||
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
|
||||
{"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
|
||||
# Extracted content should be a JSON string representing a list of dicts
|
||||
try:
|
||||
extracted_data = json.loads(result["extracted_content"])
|
||||
assert isinstance(extracted_data, list)
|
||||
assert len(extracted_data) > 0 # Should find some books
|
||||
# Check structure of the first extracted item
|
||||
first_item = extracted_data[0]
|
||||
assert "title" in first_item
|
||||
assert "price" in first_item
|
||||
assert "rating" in first_item
|
||||
assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
|
||||
except (json.JSONDecodeError, AssertionError) as e:
|
||||
pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
|
||||
# 6. Extraction with LLM
|
||||
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""
|
||||
Test /crawl with LLMExtractionStrategy.
|
||||
NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
|
||||
configured via .llm.env or environment variables.
|
||||
This test uses the default provider configured in the server's config.yml.
|
||||
"""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract the main title and the author mentioned in the text into JSON.",
|
||||
# LLMConfig is implicitly defined by server's config.yml and .llm.env
|
||||
# If you needed to override provider/token PER REQUEST:
|
||||
"llm_config": {
|
||||
"type": "LLMConfig",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4o", # Example override
|
||||
"api_token": os.getenv("OPENAI_API_KEY") # Example override
|
||||
}
|
||||
},
|
||||
"schema": { # Optional: Provide a schema for structured output
|
||||
"type": "dict", # IMPORTANT: Wrap schema dict
|
||||
"value": {
|
||||
"title": "Book Info",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string", "description": "The main title of the work"},
|
||||
"author": {"type": "string", "description": "The author of the work"}
|
||||
},
|
||||
"required": ["title", "author"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Catch potential server errors (like 500 due to missing/invalid API keys)
|
||||
pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"LLM extraction request failed: {e}.")
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
|
||||
# Extracted content should be JSON (because we provided a schema)
|
||||
try:
|
||||
extracted_data = json.loads(result["extracted_content"])
|
||||
print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
|
||||
|
||||
# Handle both dict and list formats (server returns a list)
|
||||
if isinstance(extracted_data, list):
|
||||
assert len(extracted_data) > 0
|
||||
extracted_item = extracted_data[0] # Take first item
|
||||
assert isinstance(extracted_item, dict)
|
||||
assert "title" in extracted_item
|
||||
assert "author" in extracted_item
|
||||
assert "Moby-Dick" in extracted_item.get("title", "")
|
||||
assert "Herman Melville" in extracted_item.get("author", "")
|
||||
else:
|
||||
assert isinstance(extracted_data, dict)
|
||||
assert "title" in extracted_data
|
||||
assert "author" in extracted_data
|
||||
assert "Moby-Dick" in extracted_data.get("title", "")
|
||||
assert "Herman Melville" in extracted_data.get("author", "")
|
||||
except (json.JSONDecodeError, AssertionError) as e:
|
||||
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||
except Exception as e: # Catch any other unexpected error
|
||||
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Define arguments for pytest programmatically
|
||||
# -v: verbose output
|
||||
# -s: show print statements immediately (useful for debugging)
|
||||
# __file__: tells pytest to run tests in the current file
|
||||
pytest_args = ["-v", "-s", __file__]
|
||||
|
||||
# You can add more pytest arguments here if needed, for example:
|
||||
# '-k test_llm_extraction': Run only the LLM test function
|
||||
# pytest_args.append("-k test_llm_extraction")
|
||||
|
||||
print(f"Running pytest with args: {pytest_args}")
|
||||
|
||||
# Execute pytest
|
||||
exit_code = pytest.main(pytest_args)
|
||||
|
||||
print(f"Pytest finished with exit code: {exit_code}")
|
||||
212
tests/docker/test_server_token.py
Normal file
@@ -0,0 +1,212 @@
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
async def get_token(session, email: str = "test@example.com") -> str:
|
||||
"""Fetch a JWT token from the /token endpoint."""
|
||||
url = "http://localhost:8000/token"
|
||||
payload = {"email": email}
|
||||
print(f"\nFetching token from {url} with email: {email}")
|
||||
try:
|
||||
async with session.post(url, json=payload) as response:
|
||||
status = response.status
|
||||
data = await response.json()
|
||||
print(f"Token Response Status: {status}")
|
||||
print(f"Token Response: {json.dumps(data, indent=2)}")
|
||||
if status == 200:
|
||||
return data["access_token"]
|
||||
else:
|
||||
raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
|
||||
except Exception as e:
|
||||
print(f"Error fetching token: {str(e)}")
|
||||
raise
|
||||
|
||||
async def test_endpoint(
|
||||
session,
|
||||
endpoint: str,
|
||||
url: str,
|
||||
token: str,
|
||||
params: Optional[dict] = None,
|
||||
expected_status: int = 200
|
||||
) -> Optional[dict]:
|
||||
"""Test an endpoint with token and print results."""
|
||||
params = params or {}
|
||||
param_str = "&".join(f"{k}={v}" for k, v in params.items())
|
||||
full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
|
||||
if param_str:
|
||||
full_url += f"?{param_str}"
|
||||
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting: {full_url}")
|
||||
|
||||
try:
|
||||
async with session.get(full_url, headers=headers) as response:
|
||||
status = response.status
|
||||
try:
|
||||
data = await response.json()
|
||||
except:
|
||||
data = await response.text()
|
||||
|
||||
print(f"Status: {status} (Expected: {expected_status})")
|
||||
if isinstance(data, dict):
|
||||
print(f"Response: {json.dumps(data, indent=2)}")
|
||||
else:
|
||||
print(f"Response: {data[:500]}...") # First 500 chars
|
||||
assert status == expected_status, f"Expected {expected_status}, got {status}"
|
||||
return data
|
||||
except Exception as e:
|
||||
print(f"Error: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
async def test_stream_crawl(session, token: str):
|
||||
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||
url = "http://localhost:8000/crawl/stream"
|
||||
payload = {
|
||||
"urls": [
|
||||
"https://example.com",
|
||||
"https://example.com/page1", # Replicated example.com with variation
|
||||
"https://example.com/page2", # Replicated example.com with variation
|
||||
"https://example.com/page3", # Replicated example.com with variation
|
||||
# "https://www.python.org",
|
||||
# "https://news.ycombinator.com/news"
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "bypass"}
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting Streaming Crawl: {url}")
|
||||
print(f"Payload: {json.dumps(payload, indent=2)}")
|
||||
|
||||
try:
|
||||
async with session.post(url, json=payload, headers=headers) as response:
|
||||
status = response.status
|
||||
print(f"Status: {status} (Expected: 200)")
|
||||
assert status == 200, f"Expected 200, got {status}"
|
||||
|
||||
# Read streaming response line-by-line (NDJSON)
|
||||
async for line in response.content:
|
||||
if line:
|
||||
data = json.loads(line.decode('utf-8').strip())
|
||||
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||
except Exception as e:
|
||||
print(f"Error in streaming crawl test: {str(e)}")
|
||||
|
||||
async def run_tests():
|
||||
import aiohttp
|
||||
print("Starting API Tests...")
|
||||
|
||||
# Test URLs
|
||||
urls = [
|
||||
"example.com",
|
||||
"https://www.python.org",
|
||||
"https://news.ycombinator.com/news",
|
||||
"https://github.com/trending"
|
||||
]
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# Fetch token once and reuse it
|
||||
token = await get_token(session)
|
||||
if not token:
|
||||
print("Aborting tests due to token failure!")
|
||||
return
|
||||
|
||||
print("\n=== Testing Crawl Endpoint ===")
|
||||
crawl_payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {"stream": False}
|
||||
}
|
||||
async with session.post(
|
||||
"http://localhost:8000/crawl",
|
||||
json=crawl_payload,
|
||||
headers={"Authorization": f"Bearer {token}"}
|
||||
) as response:
|
||||
status = response.status
|
||||
data = await response.json()
|
||||
print(f"\nCrawl Endpoint Status: {status}")
|
||||
print(f"Crawl Response: {json.dumps(data, indent=2)}")
|
||||
|
||||
|
||||
print("\n=== Testing Crawl Stream Endpoint ===")
|
||||
await test_stream_crawl(session, token)
|
||||
|
||||
print("\n=== Testing Markdown Endpoint ===")
|
||||
for url in []: #urls:
|
||||
for filter_type in ["raw", "fit", "bm25", "llm"]:
|
||||
params = {"f": filter_type}
|
||||
if filter_type in ["bm25", "llm"]:
|
||||
params["q"] = "extract main content"
|
||||
|
||||
for cache in ["0", "1"]:
|
||||
params["c"] = cache
|
||||
await test_endpoint(session, "md", url, token, params)
|
||||
await asyncio.sleep(1) # Be nice to the server
|
||||
|
||||
print("\n=== Testing LLM Endpoint ===")
|
||||
for url in urls:
|
||||
# Test basic extraction (direct response now)
|
||||
result = await test_endpoint(
|
||||
session,
|
||||
"llm",
|
||||
url,
|
||||
token,
|
||||
{"q": "Extract title and main content"}
|
||||
)
|
||||
|
||||
# Test with schema (direct response)
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"content": {"type": "string"},
|
||||
"links": {"type": "array", "items": {"type": "string"}}
|
||||
}
|
||||
}
|
||||
result = await test_endpoint(
|
||||
session,
|
||||
"llm",
|
||||
url,
|
||||
token,
|
||||
{
|
||||
"q": "Extract content with links",
|
||||
"s": json.dumps(schema),
|
||||
"c": "1" # Test with cache
|
||||
}
|
||||
)
|
||||
await asyncio.sleep(2) # Be nice to the server
|
||||
|
||||
print("\n=== Testing Error Cases ===")
|
||||
# Test invalid URL
|
||||
await test_endpoint(
|
||||
session,
|
||||
"md",
|
||||
"not_a_real_url",
|
||||
token,
|
||||
expected_status=500
|
||||
)
|
||||
|
||||
# Test invalid filter type
|
||||
await test_endpoint(
|
||||
session,
|
||||
"md",
|
||||
"example.com",
|
||||
token,
|
||||
{"f": "invalid"},
|
||||
expected_status=422
|
||||
)
|
||||
|
||||
# Test LLM without query (should fail per your server logic)
|
||||
await test_endpoint(
|
||||
session,
|
||||
"llm",
|
||||
"example.com",
|
||||
token,
|
||||
expected_status=400
|
||||
)
|
||||
|
||||
print("\nAll tests completed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
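# Manual equivalent of the token flow exercised above (illustrative curl calls,
# assuming the same localhost:8000 server as the tests; <access_token> is a
# placeholder for the value returned by /token):
#   curl -X POST http://localhost:8000/token \
#        -H "Content-Type: application/json" -d '{"email": "test@example.com"}'
#   curl "http://localhost:8000/md/example.com?f=fit" \
#        -H "Authorization: Bearer <access_token>"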
|
||||
335
tests/general/generate_dummy_site.py
Normal file
@@ -0,0 +1,335 @@
|
||||
# ==== File: build_dummy_site.py ====
|
||||
|
||||
import os
|
||||
import random
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
# --- Configuration ---
|
||||
NUM_CATEGORIES = 3
|
||||
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
|
||||
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
|
||||
MAX_DEPTH_TARGET = 5 # Explicitly set target depth
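# With the defaults above the generator emits 1 homepage + 3 category pages
# + (3*2) sub-category pages + (3*2*5) products * 3 pages each (product,
# details, reviews) = 100 HTML files, matching the 5 link levels targeted below.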
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
def generate_lorem(words=20):
|
||||
"""Generates simple placeholder text."""
|
||||
lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
|
||||
"adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
|
||||
"incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
|
||||
return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
|
||||
|
||||
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
|
||||
"""Creates an HTML file with basic structure and inline CSS."""
|
||||
os.makedirs(filepath.parent, exist_ok=True)
|
||||
|
||||
# Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
|
||||
breadcrumb_html = ""
|
||||
if breadcrumbs:
|
||||
links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
|
||||
breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"
|
||||
|
||||
# Basic CSS for structure identification (kept the same)
|
||||
css = """
|
||||
<style>
|
||||
body {
|
||||
font-family: sans-serif;
|
||||
padding: 20px;
|
||||
background-color: #1e1e1e;
|
||||
color: #d1d1d1;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 960px;
|
||||
margin: auto;
|
||||
background: #2c2c2c;
|
||||
padding: 20px;
|
||||
border-radius: 5px;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
|
||||
}
|
||||
|
||||
h1, h2 {
|
||||
color: #ccc;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #9bcdff;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
ul {
|
||||
list-style: none;
|
||||
padding-left: 0;
|
||||
}
|
||||
|
||||
li {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.category-link,
|
||||
.subcategory-link,
|
||||
.product-link,
|
||||
.details-link,
|
||||
.reviews-link {
|
||||
display: block;
|
||||
padding: 8px;
|
||||
background-color: #3a3a3a;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.product-preview {
|
||||
border: 1px solid #444;
|
||||
padding: 10px;
|
||||
margin-bottom: 10px;
|
||||
border-radius: 4px;
|
||||
background-color: #2a2a2a;
|
||||
}
|
||||
|
||||
.product-title {
|
||||
color: #d1d1d1;
|
||||
}
|
||||
|
||||
.product-price {
|
||||
font-weight: bold;
|
||||
color: #85e085;
|
||||
}
|
||||
|
||||
.product-description,
|
||||
.product-specs,
|
||||
.product-reviews {
|
||||
margin-top: 15px;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.product-specs li {
|
||||
margin-bottom: 5px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.spec-name {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.breadcrumbs {
|
||||
margin-bottom: 20px;
|
||||
font-size: 0.9em;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.breadcrumbs a {
|
||||
color: #9bcdff;
|
||||
}
|
||||
</style>
|
||||
"""
|
||||
html_content = f"""<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{title} - FakeShop</title>
|
||||
{head_extras}
|
||||
{css}
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
{breadcrumb_html}
|
||||
<h1>{title}</h1>
|
||||
{body_content}
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
with open(filepath, "w", encoding="utf-8") as f:
|
||||
f.write(html_content)
|
||||
# Keep print statement concise for clarity
|
||||
# print(f"Created: {filepath}")
|
||||
|
||||
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
|
||||
"""Generates the dummy website structure."""
|
||||
base_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# --- Clean and prepare the base path for URL construction ---
|
||||
# Ensure it starts with '/' if not empty, and remove any trailing '/'
|
||||
if base_path:
|
||||
full_base_path = "/" + base_path.strip('/')
|
||||
else:
|
||||
full_base_path = "" # Represents the root
|
||||
|
||||
print(f"Using base path for links: '{full_base_path}'")
|
||||
|
||||
# --- Level 0: Homepage ---
|
||||
home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
|
||||
# Define the *actual* link path for the homepage breadcrumb
|
||||
home_link_path = f"{full_base_path}/index.html"
|
||||
breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
|
||||
|
||||
# Links *within* the page content should remain relative
|
||||
for i in range(NUM_CATEGORIES):
|
||||
cat_name = f"Category-{i+1}"
|
||||
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||
# This path is relative to the current directory (index.html)
|
||||
cat_relative_page_path = f"{cat_folder_name}/index.html"
|
||||
home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
|
||||
home_body += "</ul>"
|
||||
create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
|
||||
|
||||
# --- Levels 1-5 ---
|
||||
for i in range(NUM_CATEGORIES):
|
||||
cat_name = f"Category-{i+1}"
|
||||
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||
cat_dir = base_dir / cat_folder_name
|
||||
# This is the *absolute* path for the breadcrumb link
|
||||
cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
|
||||
|
||||
# --- Level 1: Category Page ---
|
||||
cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
|
||||
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||
# Path relative to the category page
|
||||
subcat_relative_page_path = f"{subcat_folder_name}/index.html"
|
||||
cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
|
||||
cat_body += "</ul>"
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
|
||||
|
||||
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||
subcat_dir = cat_dir / subcat_folder_name
|
||||
# Absolute path for the breadcrumb link
|
||||
subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
|
||||
|
||||
# --- Level 2: Sub-Category Page (Product List) ---
|
||||
subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
|
||||
for k in range(NUM_PRODUCTS_PER_SUBCAT):
|
||||
prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
|
||||
prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
|
||||
# Filename relative to the subcategory page
|
||||
prod_filename = f"product_{prod_id}.html"
|
||||
# Absolute path for the breadcrumb link
|
||||
prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
|
||||
|
||||
# Preview on list page (link remains relative)
|
||||
subcat_body += f"""
|
||||
<li>
|
||||
<div class="product-preview">
|
||||
<a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
|
||||
<p>{generate_lorem(10)}</p>
|
||||
<span class="product-price">£{random.uniform(10, 500):.2f}</span>
|
||||
</div>
|
||||
</li>"""
|
||||
|
||||
# --- Level 3: Product Page ---
|
||||
prod_price = random.uniform(10, 500)
|
||||
prod_desc = generate_lorem(40)
|
||||
prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
|
||||
prod_reviews_count = random.randint(0, 150)
|
||||
# Relative filenames for links on this page
|
||||
details_filename_relative = f"product_{prod_id}_details.html"
|
||||
reviews_filename_relative = f"product_{prod_id}_reviews.html"
|
||||
|
||||
prod_body = f"""
|
||||
<p class="product-price">Price: £{prod_price:.2f}</p>
|
||||
<div class="product-description">
|
||||
<h2>Description</h2>
|
||||
<p>{prod_desc}</p>
|
||||
</div>
|
||||
<div class="product-specs">
|
||||
<h2>Specifications</h2>
|
||||
<ul>
|
||||
{''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
|
||||
</ul>
|
||||
</div>
|
||||
<div class="product-reviews">
|
||||
<h2>Reviews</h2>
|
||||
<p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
|
||||
</div>
|
||||
<hr>
|
||||
<p>
|
||||
<a class="details-link" href="{details_filename_relative}">View More Details</a> |
|
||||
<a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
|
||||
</p>
|
||||
"""
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
|
||||
|
||||
# --- Level 4: Product Details Page ---
|
||||
details_filename = f"product_{prod_id}_details.html" # Actual filename
|
||||
# Absolute path for the breadcrumb link
|
||||
details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
|
||||
details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||
|
||||
# --- Level 5: Product Reviews Page ---
|
||||
reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
|
||||
# Absolute path for the breadcrumb link
|
||||
reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
|
||||
reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
|
||||
for r in range(prod_reviews_count):
|
||||
reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
|
||||
reviews_body += "</ul>"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||
|
||||
|
||||
subcat_body += "</ul>" # Close product-list ul
|
||||
# Pass the correct breadcrumbs list for the subcategory index page
|
||||
create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
|
||||
parser.add_argument(
|
||||
"-o", "--output-dir",
|
||||
type=str,
|
||||
default="dummy_retail_site",
|
||||
help="Directory to generate the website in."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n", "--site-name",
|
||||
type=str,
|
||||
default="FakeShop",
|
||||
help="Name of the fake shop."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b", "--base-path",
|
||||
type=str,
|
||||
default="",
|
||||
help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
|
||||
)
|
||||
# Optional: Add more args to configure counts if needed
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
output_directory = Path(args.output_dir)
|
||||
site_name = args.site_name
|
||||
base_path = args.base_path
|
||||
|
||||
print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
|
||||
# Pass the base_path to the generation function
|
||||
generate_site(output_directory, site_name, base_path)
|
||||
print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
|
||||
print("Dummy site generation complete.")
|
||||
print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
|
||||
if base_path:
|
||||
print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
|
||||
else:
|
||||
print(f"Access the site at: http://localhost:8000/index.html")
|
||||
56
tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
HTTPCrawlerConfig,
|
||||
CacheMode,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
async def main():
|
||||
# Initialize HTTP crawler strategy
|
||||
http_strategy = AsyncHTTPCrawlerStrategy(
|
||||
browser_config=HTTPCrawlerConfig(
|
||||
method="GET",
|
||||
verify_ssl=True,
|
||||
follow_redirects=True
|
||||
),
|
||||
logger=AsyncLogger(verbose=True)
|
||||
)
|
||||
|
||||
# Initialize web crawler with HTTP strategy
|
||||
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed",
|
||||
min_word_threshold=0
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Test different URLs
|
||||
urls = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/get",
|
||||
"raw://<html><body>Test content</body></html>"
|
||||
]
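# The three entries above cover a plain page, a JSON endpoint, and the raw://
# scheme, which hands the inline HTML straight to the HTTP strategy without a
# network fetch (see the dedicated raw-content test further below).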
|
||||
|
||||
for url in urls:
|
||||
print(f"\n=== Testing {url} ===")
|
||||
try:
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Raw HTML length: {len(result.html)}")
|
||||
if hasattr(result, 'markdown'):
|
||||
print(f"Markdown length: {len(result.markdown.raw_markdown)}")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
46
tests/general/test_advanced_deep_crawl.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
"""Example deep crawl of documentation site."""
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter(patterns=["*2025*"]),
|
||||
DomainFilter(allowed_domains=["techcrunch.com"]),
|
||||
ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
|
||||
ContentTypeFilter(allowed_types=["text/html","application/javascript"])
|
||||
])
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
|
||||
),
|
||||
stream=False,
|
||||
verbose=True,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
print("Starting deep crawl in streaming mode:")
|
||||
config.stream = True
|
||||
start_time = time.perf_counter()
|
||||
async for result in await crawler.arun(
|
||||
url="https://techcrunch.com",
|
||||
config=config
|
||||
):
|
||||
print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
|
||||
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
106
tests/general/test_content_source_parameter.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
Tests for the content_source parameter in markdown generation.
|
||||
"""
|
||||
import unittest
|
||||
import asyncio
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.models import MarkdownGenerationResult
|
||||
|
||||
HTML_SAMPLE = """
|
||||
<html>
|
||||
<head><title>Test Page</title></head>
|
||||
<body>
|
||||
<h1>Test Content</h1>
|
||||
<p>This is a test paragraph.</p>
|
||||
<div class="container">
|
||||
<p>This is content within a container.</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class TestContentSourceParameter(unittest.TestCase):
|
||||
"""Test cases for the content_source parameter in markdown generation."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures."""
|
||||
self.loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(self.loop)
|
||||
|
||||
def tearDown(self):
|
||||
"""Tear down test fixtures."""
|
||||
self.loop.close()
|
||||
|
||||
def test_default_content_source(self):
|
||||
"""Test that the default content_source is 'cleaned_html'."""
|
||||
# Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator
|
||||
generator = DefaultMarkdownGenerator()
|
||||
self.assertEqual(generator.content_source, "cleaned_html")
|
||||
|
||||
def test_custom_content_source(self):
|
||||
"""Test that content_source can be customized."""
|
||||
generator = DefaultMarkdownGenerator(content_source="fit_html")
|
||||
self.assertEqual(generator.content_source, "fit_html")
|
||||
|
||||
@patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text')
|
||||
def test_html_processing_using_input_html(self, mock_html2text):
|
||||
"""Test that generate_markdown uses input_html parameter."""
|
||||
# Setup mock
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph."
|
||||
mock_html2text.return_value = mock_instance
|
||||
|
||||
# Create generator and call generate_markdown
|
||||
generator = DefaultMarkdownGenerator()
|
||||
result = generator.generate_markdown(input_html="<h1>Test Content</h1><p>This is a test paragraph.</p>")
|
||||
|
||||
# Verify input_html was passed to HTML2Text handler
|
||||
mock_instance.handle.assert_called_once()
|
||||
# Get the first positional argument
|
||||
args, _ = mock_instance.handle.call_args
|
||||
self.assertEqual(args[0], "<h1>Test Content</h1><p>This is a test paragraph.</p>")
|
||||
|
||||
# Check result
|
||||
self.assertIsInstance(result, MarkdownGenerationResult)
|
||||
self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.")
|
||||
|
||||
def test_html_source_selection_logic(self):
|
||||
"""Test that the HTML source selection logic works correctly."""
|
||||
# We'll test the dispatch pattern directly to avoid async complexities
|
||||
|
||||
# Create test data
|
||||
raw_html = "<html><body><h1>Raw HTML</h1></body></html>"
|
||||
cleaned_html = "<html><body><h1>Cleaned HTML</h1></body></html>"
|
||||
fit_html = "<html><body><h1>Preprocessed HTML</h1></body></html>"
|
||||
|
||||
# Test the dispatch pattern
|
||||
html_source_selector = {
|
||||
"raw_html": lambda: raw_html,
|
||||
"cleaned_html": lambda: cleaned_html,
|
||||
"fit_html": lambda: fit_html,
|
||||
}
|
||||
|
||||
# Test Case 1: content_source="cleaned_html"
|
||||
source_lambda = html_source_selector.get("cleaned_html")
|
||||
self.assertEqual(source_lambda(), cleaned_html)
|
||||
|
||||
# Test Case 2: content_source="raw_html"
|
||||
source_lambda = html_source_selector.get("raw_html")
|
||||
self.assertEqual(source_lambda(), raw_html)
|
||||
|
||||
# Test Case 3: content_source="fit_html"
|
||||
source_lambda = html_source_selector.get("fit_html")
|
||||
self.assertEqual(source_lambda(), fit_html)
|
||||
|
||||
# Test Case 4: Invalid content_source falls back to cleaned_html
|
||||
source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html)
|
||||
self.assertEqual(source_lambda(), cleaned_html)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
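# End-to-end sketch of the parameter under test (kept as a comment because it
# needs a live crawl; uses only classes already imported above):
#
#   async def crawl_with_fit_html():
#       generator = DefaultMarkdownGenerator(content_source="fit_html")
#       config = CrawlerRunConfig(markdown_generator=generator)
#       async with AsyncWebCrawler() as crawler:
#           result = await crawler.arun("https://example.com", config=config)
#           print(result.markdown.raw_markdown[:200])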
|
||||
17
tests/general/test_crawlers.py
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
# example_usage.py
|
||||
import asyncio
|
||||
from crawl4ai.crawlers import get_crawler
|
||||
|
||||
async def main():
|
||||
# Get the registered crawler
|
||||
example_crawler = get_crawler("example_site.content")
|
||||
|
||||
# Crawl example.com
|
||||
result = await example_crawler(url="https://example.com")
|
||||
|
||||
print(result)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
46
tests/general/test_deep_crawl.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
"""Example deep crawl of documentation site."""
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
include_external=False
|
||||
),
|
||||
stream=False,
|
||||
verbose=True,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
start_time = time.perf_counter()
|
||||
print("\nStarting deep crawl in batch mode:")
|
||||
results = await crawler.arun(
|
||||
url="https://docs.crawl4ai.com",
|
||||
config=config
|
||||
)
|
||||
print(f"Crawled {len(results)} pages")
|
||||
print(f"Example page: {results[0].url}")
|
||||
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
|
||||
|
||||
print("Starting deep crawl in streaming mode:")
|
||||
config.stream = True
|
||||
start_time = time.perf_counter()
|
||||
async for result in await crawler.arun(
|
||||
url="https://docs.crawl4ai.com",
|
||||
config=config
|
||||
):
|
||||
print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
|
||||
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
279
tests/general/test_deep_crawl_filters.py
Normal file
@@ -0,0 +1,279 @@
|
||||
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
|
||||
async def test_pattern_filter():
|
||||
# Test cases as list of tuples instead of dict for multiple patterns
|
||||
test_cases = [
|
||||
# Simple suffix patterns (*.html)
|
||||
("*.html", {
|
||||
"https://example.com/page.html": True,
|
||||
"https://example.com/path/doc.html": True,
|
||||
"https://example.com/page.htm": False,
|
||||
"https://example.com/page.html?param=1": True,
|
||||
}),
|
||||
|
||||
# Path prefix patterns (/foo/*)
|
||||
("*/article/*", {
|
||||
"https://example.com/article/123": True,
|
||||
"https://example.com/blog/article/456": True,
|
||||
"https://example.com/articles/789": False,
|
||||
"https://example.com/article": False,
|
||||
}),
|
||||
|
||||
# Complex patterns
|
||||
("blog-*-[0-9]", {
|
||||
"https://example.com/blog-post-1": True,
|
||||
"https://example.com/blog-test-9": True,
|
||||
"https://example.com/blog-post": False,
|
||||
"https://example.com/blog-post-x": False,
|
||||
}),
|
||||
|
||||
# Multiple patterns case
|
||||
(["*.pdf", "*/download/*"], {
|
||||
"https://example.com/doc.pdf": True,
|
||||
"https://example.com/download/file.txt": True,
|
||||
"https://example.com/path/download/doc": True,
|
||||
"https://example.com/uploads/file.txt": False,
|
||||
}),
|
||||
|
||||
# Edge cases
|
||||
("*", {
|
||||
"https://example.com": True,
|
||||
"": True,
|
||||
"http://test.com/path": True,
|
||||
}),
|
||||
|
||||
# Complex regex
|
||||
(r"^https?://.*\.example\.com/\d+", {
|
||||
"https://sub.example.com/123": True,
|
||||
"http://test.example.com/456": True,
|
||||
"https://example.com/789": False,
|
||||
"https://sub.example.com/abc": False,
|
||||
})
|
||||
]
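# Together the cases above exercise the three pattern styles the filter is fed:
# glob suffix/prefix patterns ("*.html", "*/article/*"), glob character classes
# ("blog-*-[0-9]"), and a full regular expression (the final raw string).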
|
||||
|
||||
def run_accuracy_test():
|
||||
print("\nAccuracy Tests:")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for patterns, test_urls in test_cases:
|
||||
filter_obj = URLPatternFilter(patterns)
|
||||
|
||||
for url, expected in test_urls.items():
|
||||
result = filter_obj.apply(url)
|
||||
if result != expected:
|
||||
print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {result}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
|
||||
|
||||
return all_passed
|
||||
|
||||
# Run tests
|
||||
print("Running Pattern Filter Tests...")
|
||||
accuracy_passed = run_accuracy_test()
|
||||
|
||||
if accuracy_passed:
|
||||
print("\n✨ All accuracy tests passed!")
|
||||
|
||||
else:
|
||||
print("\n❌ Some accuracy tests failed!")
|
||||
|
||||
async def test_domain_filter():
|
||||
from itertools import chain
|
||||
|
||||
# Test cases
|
||||
test_cases = [
|
||||
# Allowed domains
|
||||
({"allowed": "example.com"}, {
|
||||
"https://example.com/page": True,
|
||||
"http://example.com": True,
|
||||
"https://sub.example.com": False,
|
||||
"https://other.com": False,
|
||||
}),
|
||||
|
||||
({"allowed": ["example.com", "test.com"]}, {
|
||||
"https://example.com/page": True,
|
||||
"https://test.com/home": True,
|
||||
"https://other.com": False,
|
||||
}),
|
||||
|
||||
# Blocked domains
|
||||
({"blocked": "malicious.com"}, {
|
||||
"https://malicious.com": False,
|
||||
"https://safe.com": True,
|
||||
"http://malicious.com/login": False,
|
||||
}),
|
||||
|
||||
({"blocked": ["spam.com", "ads.com"]}, {
|
||||
"https://spam.com": False,
|
||||
"https://ads.com/banner": False,
|
||||
"https://example.com": True,
|
||||
}),
|
||||
|
||||
# Allowed and Blocked combination
|
||||
({"allowed": "example.com", "blocked": "sub.example.com"}, {
|
||||
"https://example.com": True,
|
||||
"https://sub.example.com": False,
|
||||
"https://other.com": False,
|
||||
}),
|
||||
]
|
||||
|
||||
def run_accuracy_test():
|
||||
print("\nAccuracy Tests:")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for params, test_urls in test_cases:
|
||||
filter_obj = DomainFilter(
|
||||
allowed_domains=params.get("allowed"),
|
||||
blocked_domains=params.get("blocked"),
|
||||
)
|
||||
|
||||
for url, expected in test_urls.items():
|
||||
result = filter_obj.apply(url)
|
||||
if result != expected:
|
||||
print(f"\u274C Failed: Params {params} with URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {result}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"\u2705 Passed: Params {params} with URL '{url}'")
|
||||
|
||||
return all_passed
|
||||
|
||||
# Run tests
|
||||
print("Running Domain Filter Tests...")
|
||||
accuracy_passed = run_accuracy_test()
|
||||
|
||||
if accuracy_passed:
|
||||
print("\n\u2728 All accuracy tests passed!")
|
||||
else:
|
||||
print("\n\u274C Some accuracy tests failed!")
|
||||
|
||||
async def test_content_relevance_filter():
|
||||
relevance_filter = ContentRelevanceFilter(
|
||||
query="What was the cause of american civil war?",
|
||||
threshold=1
|
||||
)
|
||||
|
||||
test_cases = {
|
||||
"https://en.wikipedia.org/wiki/Cricket": False,
|
||||
"https://en.wikipedia.org/wiki/American_Civil_War": True,
|
||||
}
|
||||
|
||||
print("\nRunning Content Relevance Filter Tests...")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for url, expected in test_cases.items():
|
||||
result = await relevance_filter.apply(url)
|
||||
if result != expected:
|
||||
print(f"\u274C Failed: URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {result}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"\u2705 Passed: URL '{url}'")
|
||||
|
||||
if all_passed:
|
||||
print("\n\u2728 All content relevance tests passed!")
|
||||
else:
|
||||
print("\n\u274C Some content relevance tests failed!")
|
||||
|
||||
async def test_content_type_filter():
|
||||
from itertools import chain
|
||||
|
||||
# Test cases
|
||||
test_cases = [
|
||||
# Allowed single type
|
||||
({"allowed": "image/png"}, {
|
||||
"https://example.com/image.png": True,
|
||||
"https://example.com/photo.jpg": False,
|
||||
"https://example.com/document.pdf": False,
|
||||
}),
|
||||
|
||||
# Multiple allowed types
|
||||
({"allowed": ["image/jpeg", "application/pdf"]}, {
|
||||
"https://example.com/photo.jpg": True,
|
||||
"https://example.com/document.pdf": True,
|
||||
"https://example.com/script.js": False,
|
||||
}),
|
||||
|
||||
# No extension should be allowed
|
||||
({"allowed": "application/json"}, {
|
||||
"https://example.com/api/data": True,
|
||||
"https://example.com/data.json": True,
|
||||
"https://example.com/page.html": False,
|
||||
}),
|
||||
|
||||
# Unknown extensions should not be allowed
|
||||
({"allowed": "application/octet-stream"}, {
|
||||
"https://example.com/file.unknown": True,
|
||||
"https://example.com/archive.zip": False,
|
||||
"https://example.com/software.exe": False,
|
||||
}),
|
||||
]
|
||||
|
||||
def run_accuracy_test():
|
||||
print("\nAccuracy Tests:")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for params, test_urls in test_cases:
|
||||
filter_obj = ContentTypeFilter(
|
||||
allowed_types=params.get("allowed"),
|
||||
)
|
||||
|
||||
for url, expected in test_urls.items():
|
||||
result = filter_obj.apply(url)
|
||||
if result != expected:
|
||||
print(f"\u274C Failed: Params {params} with URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {result}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"\u2705 Passed: Params {params} with URL '{url}'")
|
||||
|
||||
return all_passed
|
||||
|
||||
# Run tests
|
||||
print("Running Content Type Filter Tests...")
|
||||
accuracy_passed = run_accuracy_test()
|
||||
|
||||
if accuracy_passed:
|
||||
print("\n\u2728 All accuracy tests passed!")
|
||||
else:
|
||||
print("\n\u274C Some accuracy tests failed!")
|
||||
|
||||
async def test_seo_filter():
|
||||
seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])
|
||||
|
||||
test_cases = {
|
||||
"https://en.wikipedia.org/wiki/Search_engine_optimization": True,
|
||||
"https://en.wikipedia.org/wiki/Randomness": False,
|
||||
}
|
||||
|
||||
print("\nRunning SEO Filter Tests...")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for url, expected in test_cases.items():
|
||||
result = await seo_filter.apply(url)
|
||||
if result != expected:
|
||||
print(f"\u274C Failed: URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {result}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"\u2705 Passed: URL '{url}'")
|
||||
|
||||
if all_passed:
|
||||
print("\n\u2728 All SEO filter tests passed!")
|
||||
else:
|
||||
print("\n\u274C Some SEO filter tests failed!")
|
||||
|
||||
import asyncio
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_pattern_filter())
|
||||
asyncio.run(test_domain_filter())
|
||||
asyncio.run(test_content_type_filter())
|
||||
asyncio.run(test_content_relevance_filter())
|
||||
asyncio.run(test_seo_filter())
|
||||
179
tests/general/test_deep_crawl_scorers.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
|
||||
|
||||
|
||||
def test_scorers():
|
||||
test_cases = [
|
||||
# Keyword Scorer Tests
|
||||
{
|
||||
"scorer_type": "keyword",
|
||||
"config": {
|
||||
"keywords": ["python", "blog"],
|
||||
"weight": 1.0,
|
||||
"case_sensitive": False
|
||||
},
|
||||
"urls": {
|
||||
"https://example.com/python-blog": 1.0,
|
||||
"https://example.com/PYTHON-BLOG": 1.0,
|
||||
"https://example.com/python-only": 0.5,
|
||||
"https://example.com/other": 0.0
|
||||
}
|
||||
},
|
||||
|
||||
# Path Depth Scorer Tests
|
||||
{
|
||||
"scorer_type": "path_depth",
|
||||
"config": {
|
||||
"optimal_depth": 2,
|
||||
"weight": 1.0
|
||||
},
|
||||
"urls": {
|
||||
"https://example.com/a/b": 1.0,
|
||||
"https://example.com/a": 0.5,
|
||||
"https://example.com/a/b/c": 0.5,
|
||||
"https://example.com": 0.33333333
|
||||
}
|
||||
},
|
||||
|
||||
# Content Type Scorer Tests
|
||||
{
|
||||
"scorer_type": "content_type",
|
||||
"config": {
|
||||
"type_weights": {
|
||||
".html$": 1.0,
|
||||
".pdf$": 0.8,
|
||||
".jpg$": 0.6
|
||||
},
|
||||
"weight": 1.0
|
||||
},
|
||||
"urls": {
|
||||
"https://example.com/doc.html": 1.0,
|
||||
"https://example.com/doc.pdf": 0.8,
|
||||
"https://example.com/img.jpg": 0.6,
|
||||
"https://example.com/other.txt": 0.0
|
||||
}
|
||||
},
|
||||
|
||||
# Freshness Scorer Tests
|
||||
{
|
||||
"scorer_type": "freshness",
|
||||
"config": {
|
||||
"weight": 1.0, # Remove current_year since original doesn't support it
|
||||
},
|
||||
"urls": {
|
||||
"https://example.com/2024/01/post": 1.0,
|
||||
"https://example.com/2023/12/post": 0.9,
|
||||
"https://example.com/2022/post": 0.8,
|
||||
"https://example.com/no-date": 0.5
|
||||
}
|
||||
},
|
||||
|
||||
# Domain Authority Scorer Tests
|
||||
{
|
||||
"scorer_type": "domain",
|
||||
"config": {
|
||||
"domain_weights": {
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.8,
|
||||
"medium.com": 0.6
|
||||
},
|
||||
"default_weight": 0.3,
|
||||
"weight": 1.0
|
||||
},
|
||||
"urls": {
|
||||
"https://python.org/about": 1.0,
|
||||
"https://github.com/repo": 0.8,
|
||||
"https://medium.com/post": 0.6,
|
||||
"https://unknown.com": 0.3
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
def create_scorer(scorer_type, config):
|
||||
if scorer_type == "keyword":
|
||||
return KeywordRelevanceScorer(**config)
|
||||
elif scorer_type == "path_depth":
|
||||
return PathDepthScorer(**config)
|
||||
elif scorer_type == "content_type":
|
||||
return ContentTypeScorer(**config)
|
||||
elif scorer_type == "freshness":
|
||||
return FreshnessScorer(**config, current_year=2024)
|
||||
elif scorer_type == "domain":
|
||||
return DomainAuthorityScorer(**config)
|
||||
|
||||
def run_accuracy_test():
|
||||
print("\nAccuracy Tests:")
|
||||
print("-" * 50)
|
||||
|
||||
all_passed = True
|
||||
for test_case in test_cases:
|
||||
print(f"\nTesting {test_case['scorer_type']} scorer:")
|
||||
scorer = create_scorer(
|
||||
test_case['scorer_type'],
|
||||
test_case['config']
|
||||
)
|
||||
|
||||
for url, expected in test_case['urls'].items():
|
||||
score = round(scorer.score(url), 8)
|
||||
expected = round(expected, 8)
|
||||
|
||||
if abs(score - expected) > 0.00001:
|
||||
print(f"❌ Scorer Failed: URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {score}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"✅ Scorer Passed: URL '{url}'")
|
||||
|
||||
|
||||
return all_passed
|
||||
|
||||
def run_composite_test():
|
||||
print("\nTesting Composite Scorer:")
|
||||
print("-" * 50)
|
||||
|
||||
# Create test data
|
||||
test_urls = {
|
||||
"https://python.org/blog/2024/01/new-release.html":0.86666667,
|
||||
"https://github.com/repo/old-code.pdf": 0.62,
|
||||
"https://unknown.com/random": 0.26
|
||||
}
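# How the first expectation is reached (consistent with the per-scorer cases
# above, assuming CompositeScorer(normalize=True) averages the five scores):
# keyword 1.0 + path-depth 1/3 (depth 4 vs optimal 2) + content-type 1.0
# (.html) + freshness 1.0 (2024) + domain 1.0 (python.org) = 4.333... / 5 ≈ 0.8667.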
|
||||
|
||||
# Create composite scorers with all types
|
||||
scorers = []
|
||||
|
||||
for test_case in test_cases:
|
||||
scorer = create_scorer(
|
||||
test_case['scorer_type'],
|
||||
test_case['config']
|
||||
)
|
||||
scorers.append(scorer)
|
||||
|
||||
composite = CompositeScorer(scorers, normalize=True)
|
||||
|
||||
all_passed = True
|
||||
for url, expected in test_urls.items():
|
||||
score = round(composite.score(url), 8)
|
||||
|
||||
if abs(score - expected) > 0.00001:
|
||||
print(f"❌ Composite Failed: URL '{url}'")
|
||||
print(f" Expected: {expected}, Got: {score}")
|
||||
all_passed = False
|
||||
else:
|
||||
print(f"✅ Composite Passed: URL '{url}'")
|
||||
|
||||
return all_passed
|
||||
|
||||
# Run tests
|
||||
print("Running Scorer Tests...")
|
||||
accuracy_passed = run_accuracy_test()
|
||||
composite_passed = run_composite_test()
|
||||
|
||||
if accuracy_passed and composite_passed:
|
||||
print("\n✨ All tests passed!")
|
||||
# Note: Already have performance tests in run_scorer_performance_test()
|
||||
else:
|
||||
print("\n❌ Some tests failed!")
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_scorers()
|
||||
116
tests/general/test_http_crawler_strategy.py
Normal file
@@ -0,0 +1,116 @@
|
||||
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
|
||||
from crawl4ai.async_crawler_strategy import ConnectionTimeoutError
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
async def main():
|
||||
"""Test the AsyncHTTPCrawlerStrategy with various scenarios"""
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Initialize the strategy with default HTTPCrawlerConfig
|
||||
crawler = AsyncHTTPCrawlerStrategy(
|
||||
browser_config=HTTPCrawlerConfig(),
|
||||
logger=logger
|
||||
)
|
||||
# Test 1: Basic HTTP GET
|
||||
print("\n=== Test 1: Basic HTTP GET ===")
|
||||
result = await crawler.crawl("https://example.com")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.html)}")
|
||||
print(f"Headers: {dict(result.response_headers)}")
|
||||
|
||||
# Test 2: POST request with JSON
|
||||
print("\n=== Test 2: POST with JSON ===")
|
||||
crawler.browser_config = crawler.browser_config.clone(
|
||||
method="POST",
|
||||
json={"test": "data"},
|
||||
headers={"Content-Type": "application/json"}
|
||||
)
|
||||
try:
|
||||
result = await crawler.crawl(
|
||||
"https://httpbin.org/post",
|
||||
)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Response: {result.html[:200]}...")
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
# Test 3: File handling
|
||||
crawler.browser_config = HTTPCrawlerConfig()
|
||||
print("\n=== Test 3: Local file handling ===")
|
||||
# Create a tmp file with test content
|
||||
from tempfile import NamedTemporaryFile
|
||||
with NamedTemporaryFile(delete=False) as f:
|
||||
f.write(b"<html><body>Test content</body></html>")
|
||||
f.close()
|
||||
result = await crawler.crawl(f"file://{f.name}")
|
||||
print(f"File content: {result.html}")
|
||||
|
||||
# Test 4: Raw content
|
||||
print("\n=== Test 4: Raw content handling ===")
|
||||
raw_html = "raw://<html><body>Raw test content</body></html>"
|
||||
result = await crawler.crawl(raw_html)
|
||||
print(f"Raw content: {result.html}")
|
||||
|
||||
# Test 5: Custom hooks
|
||||
print("\n=== Test 5: Custom hooks ===")
|
||||
async def before_request(url, kwargs):
|
||||
print(f"Before request to {url}")
|
||||
kwargs['headers']['X-Custom'] = 'test'
|
||||
|
||||
async def after_request(response):
|
||||
print(f"After request, status: {response.status_code}")
|
||||
|
||||
crawler.set_hook('before_request', before_request)
|
||||
crawler.set_hook('after_request', after_request)
|
||||
result = await crawler.crawl("https://example.com")
|
||||
|
||||
# Test 6: Error handling
|
||||
print("\n=== Test 6: Error handling ===")
|
||||
try:
|
||||
await crawler.crawl("https://nonexistent.domain.test")
|
||||
except Exception as e:
|
||||
print(f"Expected error: {e}")
|
||||
|
||||
# Test 7: Redirects
|
||||
print("\n=== Test 7: Redirect handling ===")
|
||||
crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
|
||||
result = await crawler.crawl("http://httpbin.org/redirect/1")
|
||||
print(f"Final URL: {result.redirected_url}")
|
||||
|
||||
# Test 8: Custom timeout
|
||||
print("\n=== Test 8: Custom timeout ===")
|
||||
try:
|
||||
await crawler.crawl(
|
||||
"https://httpbin.org/delay/5",
|
||||
config=CrawlerRunConfig(page_timeout=2)
|
||||
)
|
||||
except ConnectionTimeoutError as e:
|
||||
print(f"Expected timeout: {e}")
|
||||
|
||||
# Test 9: SSL verification
|
||||
print("\n=== Test 9: SSL verification ===")
|
||||
crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
|
||||
try:
|
||||
await crawler.crawl("https://expired.badssl.com/")
|
||||
print("Connected to invalid SSL site with verification disabled")
|
||||
except Exception as e:
|
||||
print(f"SSL error: {e}")
|
||||
|
||||
# Test 10: Large file streaming
|
||||
print("\n=== Test 10: Large file streaming ===")
|
||||
from tempfile import NamedTemporaryFile
|
||||
with NamedTemporaryFile(delete=False) as f:
|
||||
f.write(b"<html><body>" + b"X" * 1024 * 1024 * 10 + b"</body></html>")
|
||||
f.close()
|
||||
result = await crawler.crawl("file://" + f.name)
|
||||
print(f"Large file content length: {len(result.html)}")
|
||||
os.remove(f.name)
|
||||
|
||||
await crawler.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -22,8 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,8 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
provider="openai/gpt-4o",
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
||||
instruction="""
|
||||
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
|
||||
213
tests/general/test_mhtml.py
Normal file
@@ -0,0 +1,213 @@
|
||||
# test_mhtml_capture.py
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
import re # For more robust MHTML checks
|
||||
|
||||
# Assuming these can be imported directly from the crawl4ai library
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult
|
||||
|
||||
# A reliable, simple static HTML page for testing
|
||||
# Using httpbin as it's designed for testing clients
|
||||
TEST_URL_SIMPLE = "https://httpbin.org/html"
|
||||
EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"
|
||||
|
||||
# A slightly more complex page that might involve JS (good secondary test)
|
||||
TEST_URL_JS = "https://quotes.toscrape.com/js/"
|
||||
EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML
|
||||
|
||||
# Removed the custom event_loop fixture as pytest-asyncio provides a default one.
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mhtml_capture_when_enabled():
|
||||
"""
|
||||
Verify that when CrawlerRunConfig has capture_mhtml=True,
|
||||
the CrawlResult contains valid MHTML content.
|
||||
"""
|
||||
# Create a fresh browser config and crawler instance for this test
|
||||
browser_config = BrowserConfig(headless=True) # Use headless for testing CI/CD
|
||||
# --- Key: Enable MHTML capture in the run config ---
|
||||
run_config = CrawlerRunConfig(capture_mhtml=True)
|
||||
|
||||
# Create a fresh crawler instance
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await crawler.start()
|
||||
|
||||
# Perform the crawl with the MHTML-enabled config
|
||||
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
|
||||
|
||||
# --- Assertions ---
|
||||
assert result is not None, "Crawler should return a result object"
|
||||
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
|
||||
|
||||
# 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated)
|
||||
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
|
||||
|
||||
# 2. Check if mhtml is populated
|
||||
assert result.mhtml is not None, "MHTML content should be captured when enabled"
|
||||
assert isinstance(result.mhtml, str), "MHTML content should be a string"
|
||||
assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check
|
||||
|
||||
# 3. Check for MHTML structure indicators (more robust than simple string contains)
|
||||
# MHTML files are multipart MIME messages
|
||||
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
|
||||
"MHTML should contain 'Content-Type: multipart/related;'"
|
||||
# Should contain a boundary definition
|
||||
assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
|
||||
"MHTML should contain a multipart boundary"
|
||||
# Should contain the main HTML part
|
||||
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
|
||||
"MHTML should contain a 'Content-Type: text/html' part"
|
||||
|
||||
# 4. Check if the *actual page content* is within the MHTML string
|
||||
# This confirms the snapshot captured the rendered page
|
||||
assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
|
||||
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"
|
||||
|
||||
# 5. Ensure standard HTML is still present and correct
|
||||
assert result.html is not None, "Standard HTML should still be present"
|
||||
assert isinstance(result.html, str), "Standard HTML should be a string"
|
||||
assert EXPECTED_CONTENT_SIMPLE in result.html, \
|
||||
f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"
|
||||
|
||||
finally:
|
||||
# Important: Ensure browser is completely closed even if assertions fail
|
||||
await crawler.close()
|
||||
# Help the garbage collector clean up
|
||||
crawler = None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mhtml_capture_when_disabled_explicitly():
|
||||
"""
|
||||
Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
|
||||
the CrawlResult.mhtml attribute is None.
|
||||
"""
|
||||
# Create a fresh browser config and crawler instance for this test
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
# --- Key: Explicitly disable MHTML capture ---
|
||||
run_config = CrawlerRunConfig(capture_mhtml=False)
|
||||
|
||||
# Create a fresh crawler instance
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await crawler.start()
|
||||
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
|
||||
|
||||
assert result is not None
|
||||
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
|
||||
|
||||
# 1. Check attribute existence (important for TDD start)
|
||||
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
|
||||
|
||||
# 2. Check mhtml is None
|
||||
assert result.mhtml is None, "MHTML content should be None when explicitly disabled"
|
||||
|
||||
# 3. Ensure standard HTML is still present
|
||||
assert result.html is not None
|
||||
assert EXPECTED_CONTENT_SIMPLE in result.html
|
||||
|
||||
finally:
|
||||
# Important: Ensure browser is completely closed even if assertions fail
|
||||
await crawler.close()
|
||||
# Help the garbage collector clean up
|
||||
crawler = None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mhtml_capture_when_disabled_by_default():
|
||||
"""
|
||||
Verify that if capture_mhtml is not specified (using its default),
|
||||
the CrawlResult.mhtml attribute is None.
|
||||
(This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
|
||||
"""
|
||||
# Create a fresh browser config and crawler instance for this test
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
# --- Key: Use default run config ---
|
||||
run_config = CrawlerRunConfig() # Do not specify capture_mhtml
|
||||
|
||||
# Create a fresh crawler instance
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await crawler.start()
|
||||
result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
|
||||
|
||||
assert result is not None
|
||||
assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
|
||||
|
||||
# 1. Check attribute existence
|
||||
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
|
||||
|
||||
# 2. Check mhtml is None (assuming default is False)
|
||||
assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"
|
||||
|
||||
# 3. Ensure standard HTML is still present
|
||||
assert result.html is not None
|
||||
assert EXPECTED_CONTENT_SIMPLE in result.html
|
||||
|
||||
finally:
|
||||
# Important: Ensure browser is completely closed even if assertions fail
|
||||
await crawler.close()
|
||||
# Help the garbage collector clean up
|
||||
crawler = None
|
||||
|
||||
# Optional: Add a test for a JS-heavy page if needed
|
||||
@pytest.mark.asyncio
|
||||
async def test_mhtml_capture_on_js_page_when_enabled():
|
||||
"""
|
||||
Verify MHTML capture works on a page requiring JavaScript execution.
|
||||
"""
|
||||
# Create a fresh browser config and crawler instance for this test
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
run_config = CrawlerRunConfig(
|
||||
capture_mhtml=True,
|
||||
# Add a small wait or JS execution if needed for the JS page to fully render
|
||||
# For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer
|
||||
# wait_for_timeout=2000 # Example: wait up to 2 seconds
|
||||
js_code="await new Promise(r => setTimeout(r, 500));" # Small delay after potential load
|
||||
)
|
||||
|
||||
# Create a fresh crawler instance
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await crawler.start()
|
||||
result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)
|
||||
|
||||
assert result is not None
|
||||
assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
|
||||
assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
|
||||
assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
|
||||
assert isinstance(result.mhtml, str), "MHTML content should be a string"
|
||||
assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"
|
||||
|
||||
# Check for MHTML structure
|
||||
assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
|
||||
assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)
|
||||
|
||||
# Check for content rendered by JS within the MHTML
|
||||
assert EXPECTED_CONTENT_JS in result.mhtml, \
|
||||
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"
|
||||
|
||||
# Check standard HTML too
|
||||
assert result.html is not None
|
||||
assert EXPECTED_CONTENT_JS in result.html, \
|
||||
f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"
|
||||
|
||||
finally:
|
||||
# Important: Ensure browser is completely closed even if assertions fail
|
||||
await crawler.close()
|
||||
# Help the garbage collector clean up
|
||||
crawler = None
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Use pytest for async tests
|
||||
pytest.main(["-xvs", __file__])
|
||||
185
tests/general/test_network_console_capture.py
Normal file
@@ -0,0 +1,185 @@
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from aiohttp import web
|
||||
import tempfile
|
||||
import shutil
|
||||
import os, sys, time, json
|
||||
|
||||
|
||||
async def start_test_server():
|
||||
app = web.Application()
|
||||
|
||||
async def basic_page(request):
|
||||
return web.Response(text="""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Network Request Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Page for Network Capture</h1>
|
||||
<p>This page performs network requests and console logging.</p>
|
||||
<img src="/image.png" alt="Test Image">
|
||||
<script>
|
||||
console.log("Basic console log");
|
||||
console.error("Error message");
|
||||
console.warn("Warning message");
|
||||
|
||||
// Make some XHR requests
|
||||
const xhr = new XMLHttpRequest();
|
||||
xhr.open('GET', '/api/data', true);
|
||||
xhr.send();
|
||||
|
||||
// Make a fetch request
|
||||
fetch('/api/json')
|
||||
.then(response => response.json())
|
||||
.catch(error => console.error('Fetch error:', error));
|
||||
|
||||
// Trigger an error
|
||||
setTimeout(() => {
|
||||
try {
|
||||
nonExistentFunction();
|
||||
} catch (e) {
|
||||
console.error("Caught error:", e);
|
||||
}
|
||||
}, 100);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
""", content_type="text/html")
|
||||
|
||||
async def image(request):
|
||||
# Return a small 1x1 transparent PNG
|
||||
return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png")
|
||||
|
||||
async def api_data(request):
|
||||
return web.Response(text="sample data")
|
||||
|
||||
async def api_json(request):
|
||||
return web.json_response({"status": "success", "message": "JSON data"})
|
||||
|
||||
# Register routes
|
||||
app.router.add_get('/', basic_page)
|
||||
app.router.add_get('/image.png', image)
|
||||
app.router.add_get('/api/data', api_data)
|
||||
app.router.add_get('/api/json', api_json)
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
site = web.TCPSite(runner, 'localhost', 8080)
|
||||
await site.start()
|
||||
|
||||
return runner
|
||||
|
||||
|
||||
async def test_network_console_capture():
|
||||
print("\n=== Testing Network and Console Capture ===\n")
|
||||
|
||||
# Start test server
|
||||
runner = await start_test_server()
|
||||
try:
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Test with capture disabled (default)
|
||||
print("\n1. Testing with capture disabled (default)...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is None, "Network requests should be None when capture is disabled"
|
||||
assert result.console_messages is None, "Console messages should be None when capture is disabled"
|
||||
print("✓ Default config correctly returns None for network_requests and console_messages")
|
||||
|
||||
# Test with network capture enabled
|
||||
print("\n2. Testing with network capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_network_requests=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is not None, "Network requests should be captured"
|
||||
print(f"✓ Captured {len(result.network_requests)} network requests")
|
||||
|
||||
# Check if we have both requests and responses
|
||||
request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
|
||||
response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
|
||||
print(f" - {request_count} requests, {response_count} responses")
|
||||
|
||||
# Check if we captured specific resources
|
||||
urls = [r.get("url") for r in result.network_requests]
|
||||
has_image = any("/image.png" in url for url in urls)
|
||||
has_api_data = any("/api/data" in url for url in urls)
|
||||
has_api_json = any("/api/json" in url for url in urls)
|
||||
|
||||
assert has_image, "Should have captured image request"
|
||||
assert has_api_data, "Should have captured API data request"
|
||||
assert has_api_json, "Should have captured API JSON request"
|
||||
print("✓ Captured expected network requests (image, API endpoints)")
|
||||
|
||||
# Test with console capture enabled
|
||||
print("\n3. Testing with console capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_console_messages=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.console_messages is not None, "Console messages should be captured"
|
||||
print(f"✓ Captured {len(result.console_messages)} console messages")
|
||||
|
||||
# Check if we have different types of console messages
|
||||
message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
|
||||
print(f" - Message types: {', '.join(message_types)}")
|
||||
|
||||
# Print all captured messages for debugging
|
||||
print(" - Captured messages:")
|
||||
for msg in result.console_messages:
|
||||
print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")
|
||||
|
||||
# Look for specific messages
|
||||
messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
|
||||
has_basic_log = any("Basic console log" in msg for msg in messages)
|
||||
has_error_msg = any("Error message" in msg for msg in messages)
|
||||
has_warning_msg = any("Warning message" in msg for msg in messages)
|
||||
|
||||
assert has_basic_log, "Should have captured basic console.log message"
|
||||
assert has_error_msg, "Should have captured console.error message"
|
||||
assert has_warning_msg, "Should have captured console.warn message"
|
||||
print("✓ Captured expected console messages (log, error, warning)")
|
||||
|
||||
# Test with both captures enabled
|
||||
print("\n4. Testing with both network and console capture enabled...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle", # Wait for network to be idle
|
||||
capture_network_requests=True,
|
||||
capture_console_messages=True
|
||||
)
|
||||
result = await crawler.arun(url="http://localhost:8080/", config=config)
|
||||
|
||||
assert result.network_requests is not None, "Network requests should be captured"
|
||||
assert result.console_messages is not None, "Console messages should be captured"
|
||||
print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
|
||||
|
||||
finally:
|
||||
await runner.cleanup()
|
||||
print("\nTest server shutdown")
|
||||
|
||||
|
||||
async def main():
|
||||
try:
|
||||
await test_network_console_capture()
|
||||
print("\n✅ All tests passed successfully!")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test failed: {str(e)}")
|
||||
raise
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -10,6 +10,7 @@ import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.utils import preprocess_html_for_schema
|
||||
import json
|
||||
|
||||
# Test HTML - A complex job board with companies, departments, and positions
|
||||
34
tests/hub/test_simple.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# test.py
|
||||
from crawl4ai import CrawlerHub
|
||||
import json
|
||||
|
||||
async def amazon_example():
|
||||
    if crawler_cls := CrawlerHub.get("amazon_product"):
|
||||
crawler = crawler_cls()
|
||||
print(f"Crawler version: {crawler_cls.meta['version']}")
|
||||
print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
|
||||
print(await crawler.run("https://amazon.com/test"))
|
||||
else:
|
||||
print("Crawler not found!")
|
||||
|
||||
async def google_example():
|
||||
# Get crawler dynamically
|
||||
crawler_cls = CrawlerHub.get("google_search")
|
||||
crawler = crawler_cls()
|
||||
|
||||
# Text search
|
||||
text_results = await crawler.run(
|
||||
query="apple inc",
|
||||
search_type="text",
|
||||
schema_cache_path="/Users/unclecode/.crawl4ai"
|
||||
)
|
||||
print(json.dumps(json.loads(text_results), indent=4))
|
||||
|
||||
# Image search
|
||||
# image_results = await crawler.run(query="apple inc", search_type="image")
|
||||
# print(image_results)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
# asyncio.run(amazon_example())
|
||||
asyncio.run(google_example())
|
||||
80
tests/loggers/test_logger.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
class AsyncFileLogger(AsyncLoggerBase):
|
||||
"""
|
||||
File-only asynchronous logger that writes logs to a specified file.
|
||||
"""
|
||||
|
||||
def __init__(self, log_file: str):
|
||||
"""
|
||||
Initialize the file logger.
|
||||
|
||||
Args:
|
||||
log_file: File path for logging
|
||||
"""
|
||||
self.log_file = log_file
|
||||
os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
|
||||
|
||||
def _write_to_file(self, level: str, message: str, tag: str):
|
||||
"""Write a message to the log file."""
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
|
||||
with open(self.log_file, "a", encoding="utf-8") as f:
|
||||
f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
|
||||
|
||||
def debug(self, message: str, tag: str = "DEBUG", **kwargs):
|
||||
"""Log a debug message to file."""
|
||||
self._write_to_file("DEBUG", message, tag)
|
||||
|
||||
def info(self, message: str, tag: str = "INFO", **kwargs):
|
||||
"""Log an info message to file."""
|
||||
self._write_to_file("INFO", message, tag)
|
||||
|
||||
def success(self, message: str, tag: str = "SUCCESS", **kwargs):
|
||||
"""Log a success message to file."""
|
||||
self._write_to_file("SUCCESS", message, tag)
|
||||
|
||||
def warning(self, message: str, tag: str = "WARNING", **kwargs):
|
||||
"""Log a warning message to file."""
|
||||
self._write_to_file("WARNING", message, tag)
|
||||
|
||||
def error(self, message: str, tag: str = "ERROR", **kwargs):
|
||||
"""Log an error message to file."""
|
||||
self._write_to_file("ERROR", message, tag)
|
||||
|
||||
def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
|
||||
"""Log URL fetch status to file."""
|
||||
status = "SUCCESS" if success else "FAILED"
|
||||
message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
|
||||
self._write_to_file("URL_STATUS", message, tag)
|
||||
|
||||
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
|
||||
"""Log error status to file."""
|
||||
message = f"{url[:url_length]}... | Error: {error}"
|
||||
self._write_to_file("ERROR", message, tag)
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
|
||||
await crawler.start()
|
||||
|
||||
try:
|
||||
crawl_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
# Use the crawler multiple times
|
||||
result = await crawler.arun(
|
||||
url='https://kidocode.com/',
|
||||
config=crawl_config
|
||||
)
|
||||
if result.success:
|
||||
print("First crawl - Raw Markdown Length:", len(result.markdown.raw_markdown))
|
||||
|
||||
finally:
|
||||
# Always ensure we close the crawler
|
||||
await crawler.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
119
tests/mcp/test_mcp_socket.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# pip install "mcp-sdk[ws]" anyio
|
||||
import anyio, json
|
||||
from mcp.client.websocket import websocket_client
|
||||
from mcp.client.session import ClientSession
|
||||
|
||||
async def test_list():
|
||||
async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
|
||||
async with ClientSession(r, w) as s:
|
||||
await s.initialize()
|
||||
|
||||
print("tools :", [t.name for t in (await s.list_tools()).tools])
|
||||
print("resources :", [r.name for r in (await s.list_resources()).resources])
|
||||
print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates])
|
||||
|
||||
|
||||
async def test_crawl(s: ClientSession) -> None:
|
||||
"""Hit the @mcp_tool('crawl') endpoint."""
|
||||
res = await s.call_tool(
|
||||
"crawl",
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {},
|
||||
"crawler_config": {},
|
||||
},
|
||||
)
|
||||
print("crawl →", json.loads(res.content[0].text))
|
||||
|
||||
|
||||
async def test_md(s: ClientSession) -> None:
|
||||
"""Hit the @mcp_tool('md') endpoint."""
|
||||
res = await s.call_tool(
|
||||
"md",
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"f": "fit", # or RAW, BM25, LLM
|
||||
"q": None,
|
||||
"c": "0",
|
||||
},
|
||||
)
|
||||
result = json.loads(res.content[0].text)
|
||||
print("md →", result['markdown'][:100], "...")
|
||||
|
||||
async def test_screenshot(s: ClientSession):
|
||||
res = await s.call_tool(
|
||||
"screenshot",
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"screenshot_wait_for": 1.0,
|
||||
},
|
||||
)
|
||||
png_b64 = json.loads(res.content[0].text)["screenshot"]
|
||||
print("screenshot →", png_b64[:60], "… (base64)")
|
||||
|
||||
|
||||
async def test_pdf(s: ClientSession):
|
||||
res = await s.call_tool(
|
||||
"pdf",
|
||||
{
|
||||
"url": "https://example.com",
|
||||
},
|
||||
)
|
||||
pdf_b64 = json.loads(res.content[0].text)["pdf"]
|
||||
print("pdf →", pdf_b64[:60], "… (base64)")
|
||||
|
||||
async def test_execute_js(s: ClientSession):
|
||||
# click the “More” link on Hacker News front page and wait 1 s
|
||||
res = await s.call_tool(
|
||||
"execute_js",
|
||||
{
|
||||
"url": "https://news.ycombinator.com/news",
|
||||
"js_code": [
|
||||
"await page.click('a.morelink')",
|
||||
"await page.waitForTimeout(1000)",
|
||||
],
|
||||
},
|
||||
)
|
||||
crawl_result = json.loads(res.content[0].text)
|
||||
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
|
||||
|
||||
async def test_html(s: ClientSession):
|
||||
    # fetch the crawled HTML of the Hacker News front page
|
||||
res = await s.call_tool(
|
||||
"html",
|
||||
{
|
||||
"url": "https://news.ycombinator.com/news",
|
||||
},
|
||||
)
|
||||
crawl_result = json.loads(res.content[0].text)
|
||||
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
|
||||
|
||||
async def test_context(s: ClientSession):
|
||||
    # ask the built-in context endpoint a question about the Crawl4ai library
|
||||
res = await s.call_tool(
|
||||
"ask",
|
||||
{
|
||||
"query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?"
|
||||
},
|
||||
)
|
||||
crawl_result = json.loads(res.content[0].text)
|
||||
print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
async with websocket_client("ws://localhost:11235/mcp/ws") as (r, w):
|
||||
async with ClientSession(r, w) as s:
|
||||
await s.initialize() # handshake
|
||||
tools = (await s.list_tools()).tools
|
||||
print("tools:", [t.name for t in tools])
|
||||
|
||||
# await test_list()
|
||||
await test_crawl(s)
|
||||
await test_md(s)
|
||||
await test_screenshot(s)
|
||||
await test_pdf(s)
|
||||
await test_execute_js(s)
|
||||
await test_html(s)
|
||||
await test_context(s)
|
||||
|
||||
anyio.run(main)
|
||||
11
tests/mcp/test_mcp_sse.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from mcp.client.sse import sse_client
|
||||
from mcp.client.session import ClientSession
|
||||
|
||||
async def main():
|
||||
async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
|
||||
async with ClientSession(r, w) as sess:
|
||||
print(await sess.list_tools()) # now works
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
315
tests/memory/README.md
Normal file
@@ -0,0 +1,315 @@
|
||||
# Crawl4AI Stress Testing and Benchmarking
|
||||
|
||||
This directory contains tools for stress testing Crawl4AI's `arun_many` method and dispatcher system with high volumes of URLs, evaluating performance and concurrency handling and helping to detect potential memory issues. It also includes a benchmarking system to track performance over time.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Run a default stress test (small config) and generate a report
|
||||
# (Assumes run_all.sh is updated to call run_benchmark.py)
|
||||
./run_all.sh
|
||||
```
|
||||
*Note: `run_all.sh` might need to be updated if it directly called the old script.*
|
||||
|
||||
## Overview
|
||||
|
||||
The stress testing system works by:
|
||||
|
||||
1. Generating a local test site with heavy HTML pages (regenerated by default for each test).
|
||||
2. Starting a local HTTP server to serve these pages.
|
||||
3. Running Crawl4AI's `arun_many` method against this local site using the `MemoryAdaptiveDispatcher` with configurable concurrency (`max_sessions`) (see the sketch after this list).
|
||||
4. Monitoring performance metrics via the `CrawlerMonitor` and optionally logging memory usage.
|
||||
5. Optionally generating detailed benchmark reports with visualizations using `benchmark_report.py`.
|
||||
|
||||
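For orientation, here is a minimal sketch of how steps 3 and 4 fit together outside the test harness. The URL pattern (`page_{i}.html`) and the dispatcher/monitor keyword arguments (`memory_threshold_percent`, `max_session_permit`, `monitor`, `display_mode`) are assumptions about the `MemoryAdaptiveDispatcher` / `CrawlerMonitor` API and may differ between crawl4ai versions; `test_stress_sdk.py` remains the authoritative implementation.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.async_dispatcher import CrawlerMonitor, DisplayMode, MemoryAdaptiveDispatcher


async def crawl_local_site(num_urls: int = 50, port: int = 8000) -> None:
    # URLs served by the locally generated test site (page naming is an assumption)
    urls = [f"http://localhost:{port}/page_{i}.html" for i in range(num_urls)]

    # The dispatcher throttles concurrency and backs off under memory pressure;
    # keyword names here are assumptions and may differ between crawl4ai releases.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=90.0,
        max_session_permit=16,  # rough equivalent of --max-sessions
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED),
    )

    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)

    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
        ok = sum(1 for r in results if r.success)
        print(f"{ok}/{len(results)} URLs crawled successfully")


if __name__ == "__main__":
    asyncio.run(crawl_local_site())
```
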
## Available Tools
|
||||
|
||||
- `test_stress_sdk.py` - Main stress testing script utilizing `arun_many` and dispatchers.
|
||||
- `benchmark_report.py` - Report generator for comparing test results (assumes compatibility with `test_stress_sdk.py` outputs).
|
||||
- `run_benchmark.py` - Python script with predefined test configurations that orchestrates tests using `test_stress_sdk.py`.
|
||||
- `run_all.sh` - Simple wrapper script (may need updating).
|
||||
|
||||
## Usage Guide
|
||||
|
||||
### Using Predefined Configurations (Recommended)
|
||||
|
||||
The `run_benchmark.py` script offers the easiest way to run standardized tests:
|
||||
|
||||
```bash
|
||||
# Quick test (50 URLs, 4 max sessions)
|
||||
python run_benchmark.py quick
|
||||
|
||||
# Medium test (500 URLs, 16 max sessions)
|
||||
python run_benchmark.py medium
|
||||
|
||||
# Large test (1000 URLs, 32 max sessions)
|
||||
python run_benchmark.py large
|
||||
|
||||
# Extreme test (2000 URLs, 64 max sessions)
|
||||
python run_benchmark.py extreme
|
||||
|
||||
# Custom configuration
|
||||
python run_benchmark.py custom --urls 300 --max-sessions 24 --chunk-size 50
|
||||
|
||||
# Run 'small' test in streaming mode
|
||||
python run_benchmark.py small --stream
|
||||
|
||||
# Override max_sessions for the 'medium' config
|
||||
python run_benchmark.py medium --max-sessions 20
|
||||
|
||||
# Skip benchmark report generation after the test
|
||||
python run_benchmark.py small --no-report
|
||||
|
||||
# Clean up reports and site files before running
|
||||
python run_benchmark.py medium --clean
|
||||
```
|
||||
|
||||
#### `run_benchmark.py` Parameters
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------- | --------------- | --------------------------------------------------------------------------- |
|
||||
| `config` | *required* | Test configuration: `quick`, `small`, `medium`, `large`, `extreme`, `custom`|
|
||||
| `--urls` | config-specific | Number of URLs (required for `custom`) |
|
||||
| `--max-sessions` | config-specific | Max concurrent sessions managed by dispatcher (required for `custom`) |
|
||||
| `--chunk-size` | config-specific | URLs per batch for non-stream logging (required for `custom`) |
|
||||
| `--stream` | False | Enable streaming results (disables batch logging) |
|
||||
| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live monitor |
|
||||
| `--use-rate-limiter` | False | Enable basic rate limiter in the dispatcher |
|
||||
| `--port` | 8000 | HTTP server port |
|
||||
| `--no-report` | False | Skip generating comparison report via `benchmark_report.py` |
|
||||
| `--clean` | False | Clean up reports and site files before running |
|
||||
| `--keep-server-alive`| False | Keep local HTTP server running after test |
|
||||
| `--use-existing-site`| False | Use existing site on specified port (no local server start/site gen) |
|
||||
| `--skip-generation` | False | Use existing site files but start local server |
|
||||
| `--keep-site` | False | Keep generated site files after test |
|
||||
|
||||
#### Predefined Configurations
|
||||
|
||||
| Configuration | URLs | Max Sessions | Chunk Size | Description |
|
||||
| ------------- | ------ | ------------ | ---------- | -------------------------------- |
|
||||
| `quick` | 50 | 4 | 10 | Quick test for basic validation |
|
||||
| `small` | 100 | 8 | 20 | Small test for routine checks |
|
||||
| `medium` | 500 | 16 | 50 | Medium test for thorough checks |
|
||||
| `large` | 1000 | 32 | 100 | Large test for stress testing |
|
||||
| `extreme` | 2000 | 64 | 200 | Extreme test for limit testing |
|
||||
|
||||
### Direct Usage of `test_stress_sdk.py`
|
||||
|
||||
For fine-grained control or debugging, you can run the stress test script directly:
|
||||
|
||||
```bash
|
||||
# Test with 200 URLs and 32 max concurrent sessions
|
||||
python test_stress_sdk.py --urls 200 --max-sessions 32 --chunk-size 40
|
||||
|
||||
# Clean up previous test data first
|
||||
python test_stress_sdk.py --clean-reports --clean-site --urls 100 --max-sessions 16 --chunk-size 20
|
||||
|
||||
# Change the HTTP server port and use aggregated monitor
|
||||
python test_stress_sdk.py --port 8088 --urls 100 --max-sessions 16 --monitor-mode AGGREGATED
|
||||
|
||||
# Enable streaming mode and use rate limiting
|
||||
python test_stress_sdk.py --urls 50 --max-sessions 8 --stream --use-rate-limiter
|
||||
|
||||
# Change report output location
|
||||
python test_stress_sdk.py --report-path custom_reports --urls 100 --max-sessions 16
|
||||
```
|
||||
|
||||
#### `test_stress_sdk.py` Parameters
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| -------------------- | ---------- | -------------------------------------------------------------------- |
|
||||
| `--urls` | 100 | Number of URLs to test |
|
||||
| `--max-sessions` | 16 | Maximum concurrent crawling sessions managed by the dispatcher |
|
||||
| `--chunk-size` | 10 | Number of URLs per batch (relevant for non-stream logging) |
|
||||
| `--stream` | False | Enable streaming results (disables batch logging) |
|
||||
| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live `CrawlerMonitor` |
|
||||
| `--use-rate-limiter` | False | Enable a basic `RateLimiter` within the dispatcher |
|
||||
| `--site-path` | "test_site"| Path to store/use the generated test site |
|
||||
| `--port` | 8000 | Port for the local HTTP server |
|
||||
| `--report-path` | "reports" | Path to save test result summary (JSON) and memory samples (CSV) |
|
||||
| `--skip-generation` | False | Use existing test site files but still start local server |
|
||||
| `--use-existing-site`| False | Use existing site on specified port (no local server/site gen) |
|
||||
| `--keep-server-alive`| False | Keep local HTTP server running after test completion |
|
||||
| `--keep-site` | False | Keep the generated test site files after test completion |
|
||||
| `--clean-reports` | False | Clean up report directory before running |
|
||||
| `--clean-site` | False | Clean up site directory before/after running (see script logic) |
|
||||
|
||||
### Generating Reports Only
|
||||
|
||||
If you only want to generate a benchmark report from existing test results (assuming `benchmark_report.py` is compatible):
|
||||
|
||||
```bash
|
||||
# Generate a report from existing test results in ./reports/
|
||||
python benchmark_report.py
|
||||
|
||||
# Limit to the most recent 5 test results
|
||||
python benchmark_report.py --limit 5
|
||||
|
||||
# Specify a custom source directory for test results
|
||||
python benchmark_report.py --reports-dir alternate_results
|
||||
```
|
||||
|
||||
#### `benchmark_report.py` Parameters (Assumed)
|
||||
|
||||
| Parameter | Default | Description |
|
||||
| --------------- | -------------------- | ----------------------------------------------------------- |
|
||||
| `--reports-dir` | "reports" | Directory containing `test_stress_sdk.py` result files |
|
||||
| `--output-dir` | "benchmark_reports" | Directory to save generated HTML reports and charts |
|
||||
| `--limit` | None (all results) | Limit comparison to N most recent test results |
|
||||
| `--output-file` | Auto-generated | Custom output filename for the HTML report |
|
||||
|
||||
## Understanding the Test Output
|
||||
|
||||
### Real-time Progress Display (`CrawlerMonitor`)
|
||||
|
||||
When running `test_stress_sdk.py`, the `CrawlerMonitor` provides a live view of the crawling process managed by the dispatcher; a small configuration sketch follows the mode descriptions below.
|
||||
|
||||
- **DETAILED Mode (Default):** Shows individual task status (Queued, Active, Completed, Failed), timings, memory usage per task (if `psutil` is available), overall queue statistics, and memory pressure status (if `psutil` available).
|
||||
- **AGGREGATED Mode:** Shows summary counts (Queued, Active, Completed, Failed), overall progress percentage, estimated time remaining, average URLs/sec, and memory pressure status.
|
||||
|
||||
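The mode is chosen with the `--monitor-mode` flag described earlier. A sketch of how that flag could map onto the monitor object is shown below; `DisplayMode` and the `display_mode` keyword are assumptions about the `CrawlerMonitor` constructor rather than a copy of `test_stress_sdk.py`, so verify them against your installed crawl4ai version.

```python
from crawl4ai.async_dispatcher import CrawlerMonitor, DisplayMode


def build_monitor(mode: str = "DETAILED") -> CrawlerMonitor:
    """Map the --monitor-mode flag onto a CrawlerMonitor instance."""
    display = DisplayMode.AGGREGATED if mode.upper() == "AGGREGATED" else DisplayMode.DETAILED
    return CrawlerMonitor(display_mode=display)
```
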
### Batch Log Output (Non-Streaming Mode Only)
|
||||
|
||||
If running `test_stress_sdk.py` **without** the `--stream` flag, you will *also* see per-batch summary lines printed to the console *after* the monitor display, once each chunk of URLs finishes processing:
|
||||
|
||||
```
|
||||
Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status
|
||||
───────────────────────────────────────────────────────────────────────────────────────────
|
||||
1 | 10.0% | 50.1 MB | 55.3 MB | 23.8 | 10/0 | 0.42 | Success
|
||||
2 | 20.0% | 55.3 MB | 60.1 MB | 24.1 | 10/0 | 0.41 | Success
|
||||
...
|
||||
```
|
||||
|
||||
This display provides chunk-specific metrics:
|
||||
- **Batch**: The batch number being reported.
|
||||
- **Progress**: Overall percentage of total URLs processed *after* this batch.
|
||||
- **Start Mem / End Mem**: Memory usage before and after processing this batch (if tracked).
|
||||
- **URLs/sec**: Processing speed *for this specific batch*.
|
||||
- **Success/Fail**: Number of successful and failed URLs *in this batch*.
|
||||
- **Time (s)**: Wall-clock time taken to process *this batch*.
|
||||
- **Status**: Color-coded status for the batch outcome.
|
||||
|
||||
### Summary Output
|
||||
|
||||
After test completion, a final summary is displayed:
|
||||
|
||||
```
|
||||
================================================================================
|
||||
Test Completed
|
||||
================================================================================
|
||||
Test ID: 20250418_103015
|
||||
Configuration: 100 URLs, 16 max sessions, Chunk: 10, Stream: False, Monitor: DETAILED
|
||||
Results: 100 successful, 0 failed (100 processed, 100.0% success)
|
||||
Performance: 5.85 seconds total, 17.09 URLs/second avg
|
||||
Memory Usage: Start: 50.1 MB, End: 75.3 MB, Max: 78.1 MB, Growth: 25.2 MB
|
||||
Results summary saved to reports/test_summary_20250418_103015.json
|
||||
```
|
||||
|
||||
### HTML Report Structure (Generated by `benchmark_report.py`)
|
||||
|
||||
(This section remains the same, assuming `benchmark_report.py` generates these)
|
||||
The benchmark report contains several sections:
|
||||
1. **Summary**: Overview of the latest test results and trends
|
||||
2. **Performance Comparison**: Charts showing throughput across tests
|
||||
3. **Memory Usage**: Detailed memory usage graphs for each test
|
||||
4. **Detailed Results**: Tabular data of all test metrics
|
||||
5. **Conclusion**: Automated analysis of performance and memory patterns
|
||||
|
||||
### Memory Metrics
|
||||
|
||||
(This section remains conceptually the same)
|
||||
Memory growth is the key metric for detecting leaks...
|
||||
|
||||
### Performance Metrics
|
||||
|
||||
(This section remains conceptually the same, though "URLs per Worker" is less relevant - focus on overall URLs/sec)
|
||||
Key performance indicators include:
|
||||
- **URLs per Second**: Higher is better (throughput)
|
||||
- **Success Rate**: Should be 100% in normal conditions
|
||||
- **Total Processing Time**: Lower is better
|
||||
- **Dispatcher Efficiency**: Observe queue lengths and wait times in the monitor (Detailed mode)
|
||||
|
||||
### Raw Data Files
|
||||
|
||||
Raw data is saved in the `--report-path` directory (default `./reports/`):
|
||||
|
||||
- **JSON files** (`test_summary_*.json`): Contains the final summary for each test run.
|
||||
- **CSV files** (`memory_samples_*.csv`): Contains time-series memory samples taken during the test run.
|
||||
|
||||
Example of reading raw data:
|
||||
```python
|
||||
import json
|
||||
import pandas as pd
|
||||
|
||||
# Load test summary
|
||||
test_id = "20250418_103015" # Example ID
|
||||
with open(f'reports/test_summary_{test_id}.json', 'r') as f:
|
||||
results = json.load(f)
|
||||
|
||||
# Load memory samples
|
||||
memory_df = pd.read_csv(f'reports/memory_samples_{test_id}.csv')
|
||||
|
||||
# Analyze memory_df (e.g., calculate growth, plot)
|
||||
if not memory_df['memory_info_mb'].isnull().all():
|
||||
growth = memory_df['memory_info_mb'].iloc[-1] - memory_df['memory_info_mb'].iloc[0]
|
||||
print(f"Total Memory Growth: {growth:.1f} MB")
|
||||
else:
|
||||
print("No valid memory samples found.")
|
||||
|
||||
print(f"Avg URLs/sec: {results['urls_processed'] / results['total_time_seconds']:.2f}")
|
||||
```
|
||||
|
||||
## Visualization Dependencies
|
||||
|
||||
(This section remains the same)
|
||||
For full visualization capabilities in the HTML reports generated by `benchmark_report.py`, install additional dependencies...
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
benchmarking/ # Or your top-level directory name
|
||||
├── benchmark_reports/ # Generated HTML reports (by benchmark_report.py)
|
||||
├── reports/ # Raw test result data (from test_stress_sdk.py)
|
||||
├── test_site/ # Generated test content (temporary)
|
||||
├── benchmark_report.py# Report generator
|
||||
├── run_benchmark.py # Test runner with predefined configs
|
||||
├── test_stress_sdk.py # Main stress test implementation using arun_many
|
||||
└── run_all.sh # Simple wrapper script (may need updates)
|
||||
#└── requirements.txt # Optional: Visualization dependencies for benchmark_report.py
|
||||
```
|
||||
|
||||
## Cleanup
|
||||
|
||||
To clean up after testing:
|
||||
|
||||
```bash
|
||||
# Remove the test site content (if not using --keep-site)
|
||||
rm -rf test_site
|
||||
|
||||
# Remove all raw reports and generated benchmark reports
|
||||
rm -rf reports benchmark_reports
|
||||
|
||||
# Or use the --clean flag with run_benchmark.py
|
||||
python run_benchmark.py medium --clean
|
||||
```
|
||||
|
||||
## Use in CI/CD
|
||||
|
||||
(This section remains conceptually the same, just update script names)
|
||||
These tests can be integrated into CI/CD pipelines:
|
||||
```bash
|
||||
# Example CI script
|
||||
python run_benchmark.py medium --no-report # Run test without interactive report gen
|
||||
# Check exit code
|
||||
if [ $? -ne 0 ]; then echo "Stress test failed!"; exit 1; fi
|
||||
# Optionally, run report generator and check its output/metrics
|
||||
# python benchmark_report.py
|
||||
# check_report_metrics.py reports/test_summary_*.json || exit 1
|
||||
exit 0
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- **HTTP Server Port Conflict**: Use `--port` with `run_benchmark.py` or `test_stress_sdk.py`.
|
||||
- **Memory Tracking Issues**: The `SimpleMemoryTracker` uses platform commands (`ps`, `/proc`, `tasklist`). Ensure these are available and the script has permission. If it consistently fails, memory reporting will be limited.
|
||||
- **Visualization Missing**: Related to `benchmark_report.py` and its dependencies.
|
||||
- **Site Generation Issues**: Check permissions for creating `./test_site/`. Use `--skip-generation` if you want to manage the site manually.
|
||||
- **Testing Against External Site**: Ensure the external site is running and use `--use-existing-site --port <correct_port>`.
|
||||
887
tests/memory/benchmark_report.py
Executable file
@@ -0,0 +1,887 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark reporting tool for Crawl4AI stress tests.
|
||||
Generates visual reports and comparisons between test runs.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
|
||||
# Initialize rich console
|
||||
console = Console()
|
||||
|
||||
# Try to import optional visualization dependencies
|
||||
VISUALIZATION_AVAILABLE = True
|
||||
try:
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
except ImportError:
|
||||
VISUALIZATION_AVAILABLE = False
|
||||
console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
|
||||
console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
|
||||
console.print("[yellow]Only text-based reports will be generated.[/yellow]")
|
||||
|
||||
# Configure plotting if available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
# Set plot style for dark theme
|
||||
plt.style.use('dark_background')
|
||||
sns.set_theme(style="darkgrid")
|
||||
|
||||
# Custom color palette based on Nord theme
|
||||
nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
|
||||
sns.set_palette(nord_palette)
|
||||
|
||||
class BenchmarkReporter:
|
||||
"""Generates visual reports and comparisons for Crawl4AI stress tests."""
|
||||
|
||||
def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
|
||||
"""Initialize the benchmark reporter.
|
||||
|
||||
Args:
|
||||
reports_dir: Directory containing test result files
|
||||
output_dir: Directory to save generated reports
|
||||
"""
|
||||
self.reports_dir = Path(reports_dir)
|
||||
self.output_dir = Path(output_dir)
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Configure matplotlib if available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
# Ensure the matplotlib backend works in headless environments
|
||||
mpl.use('Agg')
|
||||
|
||||
# Set up styling for plots with dark theme
|
||||
mpl.rcParams['figure.figsize'] = (12, 8)
|
||||
mpl.rcParams['font.size'] = 12
|
||||
mpl.rcParams['axes.labelsize'] = 14
|
||||
mpl.rcParams['axes.titlesize'] = 16
|
||||
mpl.rcParams['xtick.labelsize'] = 12
|
||||
mpl.rcParams['ytick.labelsize'] = 12
|
||||
mpl.rcParams['legend.fontsize'] = 12
|
||||
mpl.rcParams['figure.facecolor'] = '#1e1e1e'
|
||||
mpl.rcParams['axes.facecolor'] = '#2e3440'
|
||||
mpl.rcParams['savefig.facecolor'] = '#1e1e1e'
|
||||
mpl.rcParams['text.color'] = '#e0e0e0'
|
||||
mpl.rcParams['axes.labelcolor'] = '#e0e0e0'
|
||||
mpl.rcParams['xtick.color'] = '#e0e0e0'
|
||||
mpl.rcParams['ytick.color'] = '#e0e0e0'
|
||||
mpl.rcParams['grid.color'] = '#444444'
|
||||
mpl.rcParams['figure.edgecolor'] = '#444444'
|
||||
|
||||
def load_test_results(self, limit=None):
|
||||
"""Load all test results from the reports directory.
|
||||
|
||||
Args:
|
||||
limit: Optional limit on number of most recent tests to load
|
||||
|
||||
Returns:
|
||||
Dictionary mapping test IDs to result data
|
||||
"""
|
||||
result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
|
||||
|
||||
# Sort files by modification time (newest first)
|
||||
result_files.sort(key=os.path.getmtime, reverse=True)
|
||||
|
||||
if limit:
|
||||
result_files = result_files[:limit]
|
||||
|
||||
results = {}
|
||||
for file_path in result_files:
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
test_id = data.get('test_id')
|
||||
if test_id:
|
||||
results[test_id] = data
|
||||
|
||||
# Try to load the corresponding memory samples
|
||||
csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
|
||||
if csv_path.exists():
|
||||
try:
|
||||
memory_df = pd.read_csv(csv_path)
|
||||
results[test_id]['memory_samples'] = memory_df
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error loading {file_path}: {e}[/red]")
|
||||
|
||||
console.print(f"Loaded {len(results)} test results")
|
||||
return results
|
||||
|
||||
def generate_summary_table(self, results):
|
||||
"""Generate a summary table of test results.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
|
||||
Returns:
|
||||
Rich Table object
|
||||
"""
|
||||
table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
|
||||
|
||||
# Define columns
|
||||
table.add_column("Test ID", style="cyan")
|
||||
table.add_column("Date", style="bright_green")
|
||||
table.add_column("URLs", justify="right")
|
||||
table.add_column("Workers", justify="right")
|
||||
table.add_column("Success %", justify="right")
|
||||
table.add_column("Time (s)", justify="right")
|
||||
table.add_column("Mem Growth", justify="right")
|
||||
table.add_column("URLs/sec", justify="right")
|
||||
|
||||
# Add rows
|
||||
for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
|
||||
# Parse timestamp from test_id
|
||||
try:
|
||||
date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
|
||||
except:
|
||||
date_str = "Unknown"
|
||||
|
||||
# Calculate success percentage
|
||||
total_urls = data.get('url_count', 0)
|
||||
successful = data.get('successful_urls', 0)
|
||||
success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
|
||||
|
||||
# Calculate memory growth if available
|
||||
mem_growth = "N/A"
|
||||
if 'memory_samples' in data:
|
||||
samples = data['memory_samples']
|
||||
if len(samples) >= 2:
|
||||
# Try to extract numeric values from memory_info strings
|
||||
try:
|
||||
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
|
||||
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
|
||||
mem_growth = f"{last_mem - first_mem:.1f} MB"
|
||||
except:
|
||||
pass
|
||||
|
||||
# Calculate URLs per second
|
||||
time_taken = data.get('total_time_seconds', 0)
|
||||
urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
|
||||
|
||||
table.add_row(
|
||||
test_id,
|
||||
date_str,
|
||||
str(total_urls),
|
||||
str(data.get('workers', 'N/A')),
|
||||
f"{success_pct:.1f}%",
|
||||
f"{data.get('total_time_seconds', 0):.2f}",
|
||||
mem_growth,
|
||||
f"{urls_per_sec:.1f}"
|
||||
)
|
||||
|
||||
return table
|
||||
|
||||
def generate_performance_chart(self, results, output_file=None):
|
||||
"""Generate a performance comparison chart.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
output_file: File path to save the chart
|
||||
|
||||
Returns:
|
||||
Path to the saved chart file or None if visualization is not available
|
||||
"""
|
||||
if not VISUALIZATION_AVAILABLE:
|
||||
console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
|
||||
return None
|
||||
|
||||
# Extract relevant data
|
||||
data = []
|
||||
for test_id, result in results.items():
|
||||
urls = result.get('url_count', 0)
|
||||
workers = result.get('workers', 0)
|
||||
time_taken = result.get('total_time_seconds', 0)
|
||||
urls_per_sec = urls / time_taken if time_taken > 0 else 0
|
||||
|
||||
# Parse timestamp from test_id for sorting
|
||||
try:
|
||||
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
|
||||
data.append({
|
||||
'test_id': test_id,
|
||||
'timestamp': timestamp,
|
||||
'urls': urls,
|
||||
'workers': workers,
|
||||
'time_seconds': time_taken,
|
||||
'urls_per_sec': urls_per_sec
|
||||
})
|
||||
except:
|
||||
console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
|
||||
|
||||
if not data:
|
||||
console.print("[yellow]No valid data for performance chart[/yellow]")
|
||||
return None
|
||||
|
||||
# Convert to DataFrame and sort by timestamp
|
||||
df = pd.DataFrame(data)
|
||||
df = df.sort_values('timestamp')
|
||||
|
||||
# Create the plot
|
||||
fig, ax1 = plt.subplots(figsize=(12, 6))
|
||||
|
||||
# Plot URLs per second as bars with properly set x-axis
|
||||
x_pos = range(len(df['test_id']))
|
||||
bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
|
||||
ax1.set_ylabel('URLs per Second', color='#88c0d0')
|
||||
ax1.tick_params(axis='y', labelcolor='#88c0d0')
|
||||
|
||||
# Properly set x-axis labels
|
||||
ax1.set_xticks(x_pos)
|
||||
ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
|
||||
|
||||
# Add worker count as text on each bar
|
||||
for i, bar in enumerate(bars):
|
||||
height = bar.get_height()
|
||||
workers = df.iloc[i]['workers']
|
||||
ax1.text(i, height + 0.1,
|
||||
f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
|
||||
|
||||
# Add a second y-axis for total URLs
|
||||
ax2 = ax1.twinx()
|
||||
ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
|
||||
ax2.set_ylabel('Total URLs', color='#bf616a')
|
||||
ax2.tick_params(axis='y', labelcolor='#bf616a')
|
||||
|
||||
# Set title and layout
|
||||
plt.title('Crawl4AI Performance Benchmarks')
|
||||
plt.tight_layout()
|
||||
|
||||
# Save the figure
|
||||
if output_file is None:
|
||||
output_file = self.output_dir / "performance_comparison.png"
|
||||
plt.savefig(output_file, dpi=100, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
return output_file
|
||||
|
||||
def generate_memory_charts(self, results, output_prefix=None):
|
||||
"""Generate memory usage charts for each test.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
output_prefix: Prefix for output file names
|
||||
|
||||
Returns:
|
||||
List of paths to the saved chart files
|
||||
"""
|
||||
if not VISUALIZATION_AVAILABLE:
|
||||
console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
|
||||
return []
|
||||
|
||||
output_files = []
|
||||
|
||||
for test_id, result in results.items():
|
||||
if 'memory_samples' not in result:
|
||||
continue
|
||||
|
||||
memory_df = result['memory_samples']
|
||||
|
||||
# Check if we have enough data points
|
||||
if len(memory_df) < 2:
|
||||
continue
|
||||
|
||||
# Try to extract numeric values from memory_info strings
|
||||
try:
|
||||
memory_values = []
|
||||
for mem_str in memory_df['memory_info']:
|
||||
# Extract the number from strings like "142.8 MB"
|
||||
value = float(mem_str.split()[0])
|
||||
memory_values.append(value)
|
||||
|
||||
memory_df['memory_mb'] = memory_values
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
|
||||
continue
|
||||
|
||||
# Create the plot
|
||||
plt.figure(figsize=(10, 6))
|
||||
|
||||
# Plot memory usage over time
|
||||
plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'],
|
||||
color='#88c0d0', marker='o', linewidth=2, markersize=4)
|
||||
|
||||
# Add annotations for chunk processing
|
||||
chunk_size = result.get('chunk_size', 0)
|
||||
url_count = result.get('url_count', 0)
|
||||
if chunk_size > 0 and url_count > 0:
|
||||
# Estimate chunk processing times
|
||||
num_chunks = (url_count + chunk_size - 1) // chunk_size # Ceiling division
|
||||
total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
|
||||
chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
|
||||
|
||||
for i, time_point in enumerate(chunk_times):
|
||||
if time_point <= memory_df['elapsed_seconds'].max():
|
||||
plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
|
||||
plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}',
|
||||
rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
|
||||
|
||||
# Set labels and title
|
||||
plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
|
||||
plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
|
||||
plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)',
|
||||
color='#e0e0e0')
|
||||
|
||||
# Add grid and set y-axis to start from zero
|
||||
plt.grid(True, alpha=0.3, color='#4c566a')
|
||||
|
||||
# Add test metadata as text
|
||||
info_text = (
|
||||
f"URLs: {url_count}\n"
|
||||
f"Workers: {result.get('workers', 'N/A')}\n"
|
||||
f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
|
||||
f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
|
||||
)
|
||||
|
||||
# Calculate memory growth
|
||||
if len(memory_df) >= 2:
|
||||
first_mem = memory_df.iloc[0]['memory_mb']
|
||||
last_mem = memory_df.iloc[-1]['memory_mb']
|
||||
growth = last_mem - first_mem
|
||||
growth_rate = growth / result.get('total_time_seconds', 1)
|
||||
|
||||
info_text += f"Memory Growth: {growth:.1f} MB\n"
|
||||
info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
|
||||
|
||||
plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
|
||||
bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
|
||||
|
||||
# Save the figure
|
||||
if output_prefix is None:
|
||||
output_file = self.output_dir / f"memory_chart_{test_id}.png"
|
||||
else:
|
||||
output_file = Path(f"{output_prefix}_memory_{test_id}.png")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_file, dpi=100, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
output_files.append(output_file)
|
||||
|
||||
return output_files
|
||||
|
||||
def generate_comparison_report(self, results, title=None, output_file=None):
|
||||
"""Generate a comprehensive comparison report of multiple test runs.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
title: Optional title for the report
|
||||
output_file: File path to save the report
|
||||
|
||||
Returns:
|
||||
Path to the saved report file
|
||||
"""
|
||||
if not results:
|
||||
console.print("[yellow]No results to generate comparison report[/yellow]")
|
||||
return None
|
||||
|
||||
if output_file is None:
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_file = self.output_dir / f"comparison_report_{timestamp}.html"
|
||||
|
||||
# Create data for the report
|
||||
rows = []
|
||||
for test_id, data in results.items():
|
||||
# Calculate metrics
|
||||
urls = data.get('url_count', 0)
|
||||
workers = data.get('workers', 0)
|
||||
successful = data.get('successful_urls', 0)
|
||||
failed = data.get('failed_urls', 0)
|
||||
time_seconds = data.get('total_time_seconds', 0)
|
||||
|
||||
# Calculate additional metrics
|
||||
success_rate = (successful / urls) * 100 if urls > 0 else 0
|
||||
urls_per_second = urls / time_seconds if time_seconds > 0 else 0
|
||||
urls_per_worker = urls / workers if workers > 0 else 0
|
||||
|
||||
# Calculate memory growth if available
|
||||
mem_start = None
|
||||
mem_end = None
|
||||
mem_growth = None
|
||||
if 'memory_samples' in data:
|
||||
samples = data['memory_samples']
|
||||
if len(samples) >= 2:
|
||||
try:
|
||||
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
|
||||
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
|
||||
mem_start = first_mem
|
||||
mem_end = last_mem
|
||||
mem_growth = last_mem - first_mem
|
||||
except:
|
||||
pass
|
||||
|
||||
# Parse timestamp from test_id
|
||||
try:
|
||||
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
|
||||
except:
|
||||
timestamp = None
|
||||
|
||||
rows.append({
|
||||
'test_id': test_id,
|
||||
'timestamp': timestamp,
|
||||
'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
|
||||
'urls': urls,
|
||||
'workers': workers,
|
||||
'chunk_size': data.get('chunk_size', 0),
|
||||
'successful': successful,
|
||||
'failed': failed,
|
||||
'success_rate': success_rate,
|
||||
'time_seconds': time_seconds,
|
||||
'urls_per_second': urls_per_second,
|
||||
'urls_per_worker': urls_per_worker,
|
||||
'memory_start': mem_start,
|
||||
'memory_end': mem_end,
|
||||
'memory_growth': mem_growth
|
||||
})
|
||||
|
||||
# Sort data by timestamp if possible
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
# Convert to DataFrame and sort by timestamp
|
||||
df = pd.DataFrame(rows)
|
||||
if 'timestamp' in df.columns and not df['timestamp'].isna().all():
|
||||
df = df.sort_values('timestamp', ascending=False)
|
||||
else:
|
||||
# Simple sorting without pandas
|
||||
rows.sort(key=lambda x: x.get('timestamp', datetime.now()), reverse=True)
|
||||
df = None
|
||||
|
||||
# Generate HTML report
|
||||
html = []
|
||||
html.append('<!DOCTYPE html>')
|
||||
html.append('<html lang="en">')
|
||||
html.append('<head>')
|
||||
html.append('<meta charset="UTF-8">')
|
||||
html.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
|
||||
html.append(f'<title>{title or "Crawl4AI Benchmark Comparison"}</title>')
|
||||
html.append('<style>')
|
||||
html.append('''
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
color: #e0e0e0;
|
||||
background-color: #1e1e1e;
|
||||
}
|
||||
h1, h2, h3 {
|
||||
color: #81a1c1;
|
||||
}
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
th, td {
|
||||
text-align: left;
|
||||
padding: 12px;
|
||||
border-bottom: 1px solid #444;
|
||||
}
|
||||
th {
|
||||
background-color: #2e3440;
|
||||
font-weight: bold;
|
||||
}
|
||||
tr:hover {
|
||||
background-color: #2e3440;
|
||||
}
|
||||
a {
|
||||
color: #88c0d0;
|
||||
text-decoration: none;
|
||||
}
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.chart-container {
|
||||
margin: 30px 0;
|
||||
text-align: center;
|
||||
background-color: #2e3440;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
}
|
||||
.chart-container img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
border: 1px solid #444;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.3);
|
||||
}
|
||||
.card {
|
||||
border: 1px solid #444;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 20px;
|
||||
background-color: #2e3440;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.2);
|
||||
}
|
||||
.highlight {
|
||||
background-color: #3b4252;
|
||||
font-weight: bold;
|
||||
}
|
||||
.status-good {
|
||||
color: #a3be8c;
|
||||
}
|
||||
.status-warning {
|
||||
color: #ebcb8b;
|
||||
}
|
||||
.status-bad {
|
||||
color: #bf616a;
|
||||
}
|
||||
''')
|
||||
html.append('</style>')
|
||||
html.append('</head>')
|
||||
html.append('<body>')
|
||||
|
||||
# Header
|
||||
html.append(f'<h1>{title or "Crawl4AI Benchmark Comparison"}</h1>')
|
||||
html.append(f'<p>Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>')
|
||||
|
||||
# Summary section
|
||||
html.append('<div class="card">')
|
||||
html.append('<h2>Summary</h2>')
|
||||
html.append('<p>This report compares the performance of Crawl4AI across multiple test runs.</p>')
|
||||
|
||||
# Summary metrics
|
||||
data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0)
|
||||
if data_available:
|
||||
# Get the latest test data
|
||||
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
|
||||
latest_test = df.iloc[0]
|
||||
latest_id = latest_test['test_id']
|
||||
else:
|
||||
latest_test = rows[0] # First row (already sorted by timestamp)
|
||||
latest_id = latest_test['test_id']
|
||||
|
||||
html.append('<h3>Latest Test Results</h3>')
|
||||
html.append('<ul>')
|
||||
html.append(f'<li><strong>Test ID:</strong> {latest_id}</li>')
|
||||
html.append(f'<li><strong>Date:</strong> {latest_test["date"]}</li>')
|
||||
html.append(f'<li><strong>URLs:</strong> {latest_test["urls"]}</li>')
|
||||
html.append(f'<li><strong>Workers:</strong> {latest_test["workers"]}</li>')
|
||||
html.append(f'<li><strong>Success Rate:</strong> {latest_test["success_rate"]:.1f}%</li>')
|
||||
html.append(f'<li><strong>Time:</strong> {latest_test["time_seconds"]:.2f} seconds</li>')
|
||||
html.append(f'<li><strong>Performance:</strong> {latest_test["urls_per_second"]:.1f} URLs/second</li>')
|
||||
|
||||
# Check memory growth (handle both pandas and dict mode)
|
||||
memory_growth_available = False
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
if pd.notna(latest_test["memory_growth"]):
|
||||
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
|
||||
memory_growth_available = True
|
||||
else:
|
||||
if latest_test["memory_growth"] is not None:
|
||||
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
|
||||
memory_growth_available = True
|
||||
|
||||
html.append('</ul>')
|
||||
|
||||
# If we have more than one test, show trend
|
||||
if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1):
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
prev_test = df.iloc[1]
|
||||
else:
|
||||
prev_test = rows[1]
|
||||
|
||||
# Calculate performance change
|
||||
perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0
|
||||
|
||||
status_class = ""
|
||||
if perf_change > 5:
|
||||
status_class = "status-good"
|
||||
elif perf_change < -5:
|
||||
status_class = "status-bad"
|
||||
|
||||
html.append('<h3>Performance Trend</h3>')
|
||||
html.append('<ul>')
|
||||
html.append(f'<li><strong>Performance Change:</strong> <span class="{status_class}">{perf_change:+.1f}%</span> compared to previous test</li>')
|
||||
|
||||
# Memory trend if available
|
||||
memory_trend_available = False
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]):
|
||||
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
|
||||
memory_trend_available = True
|
||||
else:
|
||||
if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None:
|
||||
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
|
||||
memory_trend_available = True
|
||||
|
||||
if memory_trend_available:
|
||||
mem_status = ""
|
||||
if mem_change < -1: # Improved (less growth)
|
||||
mem_status = "status-good"
|
||||
elif mem_change > 1: # Worse (more growth)
|
||||
mem_status = "status-bad"
|
||||
|
||||
html.append(f'<li><strong>Memory Trend:</strong> <span class="{mem_status}">{mem_change:+.1f} MB</span> change in memory growth</li>')
|
||||
|
||||
html.append('</ul>')
|
||||
|
||||
html.append('</div>')
|
||||
|
||||
# Generate performance chart if visualization is available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
perf_chart = self.generate_performance_chart(results)
|
||||
if perf_chart:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Performance Comparison</h2>')
|
||||
html.append(f'<img src="{os.path.relpath(perf_chart, os.path.dirname(output_file))}" alt="Performance Comparison Chart">')
|
||||
html.append('</div>')
|
||||
else:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Performance Comparison</h2>')
|
||||
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
|
||||
html.append('</div>')
|
||||
|
||||
# Generate memory charts if visualization is available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
memory_charts = self.generate_memory_charts(results)
|
||||
if memory_charts:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Memory Usage</h2>')
|
||||
|
||||
for chart in memory_charts:
|
||||
test_id = chart.stem.split('_')[-1]
|
||||
html.append(f'<h3>Test {test_id}</h3>')
|
||||
html.append(f'<img src="{os.path.relpath(chart, os.path.dirname(output_file))}" alt="Memory Chart for {test_id}">')
|
||||
|
||||
html.append('</div>')
|
||||
else:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Memory Usage</h2>')
|
||||
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
|
||||
html.append('</div>')
|
||||
|
||||
# Detailed results table
|
||||
html.append('<h2>Detailed Results</h2>')
|
||||
|
||||
# Add the results as an HTML table
|
||||
html.append('<table>')
|
||||
|
||||
# Table headers
|
||||
html.append('<tr>')
|
||||
for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']:
|
||||
html.append(f'<th>{col}</th>')
|
||||
html.append('</tr>')
|
||||
|
||||
# Table rows - handle both pandas DataFrame and list of dicts
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
# Using pandas DataFrame
|
||||
for _, row in df.iterrows():
|
||||
html.append('<tr>')
|
||||
html.append(f'<td>{row["test_id"]}</td>')
|
||||
html.append(f'<td>{row["date"]}</td>')
|
||||
html.append(f'<td>{row["urls"]}</td>')
|
||||
html.append(f'<td>{row["workers"]}</td>')
|
||||
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
|
||||
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
|
||||
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
|
||||
|
||||
# Memory growth cell
|
||||
if pd.notna(row["memory_growth"]):
|
||||
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
|
||||
else:
|
||||
html.append('<td>N/A</td>')
|
||||
|
||||
html.append('</tr>')
|
||||
else:
|
||||
# Using list of dicts (when pandas is not available)
|
||||
for row in rows:
|
||||
html.append('<tr>')
|
||||
html.append(f'<td>{row["test_id"]}</td>')
|
||||
html.append(f'<td>{row["date"]}</td>')
|
||||
html.append(f'<td>{row["urls"]}</td>')
|
||||
html.append(f'<td>{row["workers"]}</td>')
|
||||
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
|
||||
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
|
||||
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
|
||||
|
||||
# Memory growth cell
|
||||
if row["memory_growth"] is not None:
|
||||
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
|
||||
else:
|
||||
html.append('<td>N/A</td>')
|
||||
|
||||
html.append('</tr>')
|
||||
|
||||
html.append('</table>')
|
||||
|
||||
# Conclusion section
|
||||
html.append('<div class="card">')
|
||||
html.append('<h2>Conclusion</h2>')
|
||||
|
||||
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
|
||||
# Using pandas for statistics (when available)
|
||||
# Calculate some overall statistics
|
||||
avg_urls_per_sec = df['urls_per_second'].mean()
|
||||
max_urls_per_sec = df['urls_per_second'].max()
|
||||
|
||||
# Determine if we have a trend
|
||||
if len(df) > 1:
|
||||
trend_data = df.sort_values('timestamp')
|
||||
first_perf = trend_data.iloc[0]['urls_per_second']
|
||||
last_perf = trend_data.iloc[-1]['urls_per_second']
|
||||
|
||||
perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0
|
||||
|
||||
if perf_change > 10:
|
||||
trend_desc = "significantly improved"
|
||||
trend_class = "status-good"
|
||||
elif perf_change > 5:
|
||||
trend_desc = "improved"
|
||||
trend_class = "status-good"
|
||||
elif perf_change < -10:
|
||||
trend_desc = "significantly decreased"
|
||||
trend_class = "status-bad"
|
||||
elif perf_change < -5:
|
||||
trend_desc = "decreased"
|
||||
trend_class = "status-bad"
|
||||
else:
|
||||
trend_desc = "remained stable"
|
||||
trend_class = ""
|
||||
|
||||
html.append(f'<p>Overall performance has <span class="{trend_class}">{trend_desc}</span> over the test period.</p>')
|
||||
|
||||
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
|
||||
# Memory leak assessment
|
||||
if 'memory_growth' in df.columns and not df['memory_growth'].isna().all():
|
||||
avg_growth = df['memory_growth'].mean()
|
||||
max_growth = df['memory_growth'].max()
|
||||
|
||||
if avg_growth < 5:
|
||||
leak_assessment = "No significant memory leaks detected"
|
||||
leak_class = "status-good"
|
||||
elif avg_growth < 10:
|
||||
leak_assessment = "Minor memory growth observed"
|
||||
leak_class = "status-warning"
|
||||
else:
|
||||
leak_assessment = "Potential memory leak detected"
|
||||
leak_class = "status-bad"
|
||||
|
||||
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
|
||||
else:
|
||||
# Manual calculations without pandas
|
||||
if rows:
|
||||
# Calculate average and max throughput
|
||||
total_urls_per_sec = sum(row['urls_per_second'] for row in rows)
|
||||
avg_urls_per_sec = total_urls_per_sec / len(rows)
|
||||
max_urls_per_sec = max(row['urls_per_second'] for row in rows)
|
||||
|
||||
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
|
||||
# Memory assessment (simplified without pandas)
|
||||
growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None]
|
||||
if growth_values:
|
||||
avg_growth = sum(growth_values) / len(growth_values)
|
||||
|
||||
if avg_growth < 5:
|
||||
leak_assessment = "No significant memory leaks detected"
|
||||
leak_class = "status-good"
|
||||
elif avg_growth < 10:
|
||||
leak_assessment = "Minor memory growth observed"
|
||||
leak_class = "status-warning"
|
||||
else:
|
||||
leak_assessment = "Potential memory leak detected"
|
||||
leak_class = "status-bad"
|
||||
|
||||
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
|
||||
else:
|
||||
html.append('<p>No test data available for analysis.</p>')
|
||||
|
||||
html.append('</div>')
|
||||
|
||||
# Footer
|
||||
html.append('<div style="margin-top: 30px; text-align: center; color: #777; font-size: 0.9em;">')
|
||||
html.append('<p>Generated by Crawl4AI Benchmark Reporter</p>')
|
||||
html.append('</div>')
|
||||
|
||||
html.append('</body>')
|
||||
html.append('</html>')
|
||||
|
||||
# Write the HTML file
|
||||
with open(output_file, 'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
|
||||
# Print a clickable link for terminals that support it (iTerm, VS Code, etc.)
|
||||
file_url = f"file://{os.path.abspath(output_file)}"
|
||||
console.print(f"[green]Comparison report saved to: {output_file}[/green]")
|
||||
console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]")
|
||||
return output_file
|
||||
|
||||
def run(self, limit=None, output_file=None):
|
||||
"""Generate a full benchmark report.
|
||||
|
||||
Args:
|
||||
limit: Optional limit on number of most recent tests to include
|
||||
output_file: Optional output file path
|
||||
|
||||
Returns:
|
||||
Path to the generated report file
|
||||
"""
|
||||
# Load test results
|
||||
results = self.load_test_results(limit=limit)
|
||||
|
||||
if not results:
|
||||
console.print("[yellow]No test results found. Run some tests first.[/yellow]")
|
||||
return None
|
||||
|
||||
# Generate and display summary table
|
||||
summary_table = self.generate_summary_table(results)
|
||||
console.print(summary_table)
|
||||
|
||||
# Generate comparison report
|
||||
title = f"Crawl4AI Benchmark Report ({len(results)} test runs)"
|
||||
report_file = self.generate_comparison_report(results, title=title, output_file=output_file)
|
||||
|
||||
if report_file:
|
||||
console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]")
|
||||
return report_file
|
||||
else:
|
||||
console.print("[bold red]Failed to generate report[/bold red]")
|
||||
return None
|
||||
|
||||
|
||||
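# Minimal programmatic usage sketch (constructor arguments mirror the CLI flags in main() below):
#   reporter = BenchmarkReporter(reports_dir="reports", output_dir="benchmark_reports")
#   report_path = reporter.run(limit=5)  # compare only the 5 most recent test runs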
def main():
|
||||
"""Main entry point for the benchmark reporter."""
|
||||
parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests")
|
||||
|
||||
parser.add_argument("--reports-dir", type=str, default="reports",
|
||||
help="Directory containing test result files")
|
||||
parser.add_argument("--output-dir", type=str, default="benchmark_reports",
|
||||
help="Directory to save generated reports")
|
||||
parser.add_argument("--limit", type=int, default=None,
|
||||
help="Limit to most recent N test results")
|
||||
parser.add_argument("--output-file", type=str, default=None,
|
||||
help="Custom output file path for the report")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Create the benchmark reporter
|
||||
reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir)
|
||||
|
||||
# Generate the report
|
||||
report_file = reporter.run(limit=args.limit, output_file=args.output_file)
|
||||
|
||||
if report_file:
|
||||
print(f"Report generated at: {report_file}")
|
||||
return 0
|
||||
else:
|
||||
print("Failed to generate report")
|
||||
return 1
|
||||
|
||||
|
||||
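# Example invocation (flags as defined above):
#   python benchmark_report.py --reports-dir reports --output-dir benchmark_reports --limit 5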
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.exit(main())
|
||||
34
tests/memory/cap_test.py
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
|
||||
"""
|
||||
|
||||
import asyncio, httpx, json, uuid, argparse
|
||||
|
||||
API = "http://localhost:8020/crawl"
|
||||
URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
|
||||
CONCURRENT_CALLS = 20 # well above the configured concurrency cap
|
||||
|
||||
payload_template = {
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"cache_mode": "BYPASS", "verbose": False},
|
||||
}
|
||||
}
|
||||
|
||||
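# Illustrative request body (sketch): after one_call() adds a unique URL, each POST
# carries JSON roughly like this (the uuid path segment below is a placeholder):
# {
#     "browser_config": {"type": "BrowserConfig", "params": {"headless": true}},
#     "crawler_config": {"type": "CrawlerRunConfig",
#                        "params": {"cache_mode": "BYPASS", "verbose": false}},
#     "urls": ["https://httpbin.org/anything/<uuid4>"]
# }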
async def one_call(client):
|
||||
payload = payload_template.copy()
|
||||
payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
|
||||
r = await client.post(API, json=payload)
|
||||
r.raise_for_status()
|
||||
return r.json()["server_peak_memory_mb"]
|
||||
|
||||
async def main():
|
||||
async with httpx.AsyncClient(timeout=60) as client:
|
||||
tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
|
||||
mem_usages = await asyncio.gather(*tasks)
|
||||
print("Calls finished OK, server peaks reported:", mem_usages)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
4
tests/memory/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
pandas>=1.5.0
|
||||
matplotlib>=3.5.0
|
||||
seaborn>=0.12.0
|
||||
rich>=12.0.0
|
||||
259
tests/memory/run_benchmark.py
Executable file
@@ -0,0 +1,259 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Run a complete Crawl4AI benchmark test using test_stress_sdk.py and generate a report.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
import argparse
|
||||
import subprocess
|
||||
import time
import pathlib
|
||||
from datetime import datetime
|
||||
|
||||
from rich.console import Console
|
||||
from rich.text import Text
|
||||
|
||||
console = Console()
|
||||
|
||||
# Updated TEST_CONFIGS to use max_sessions
|
||||
TEST_CONFIGS = {
|
||||
"quick": {"urls": 50, "max_sessions": 4, "chunk_size": 10, "description": "Quick test (50 URLs, 4 sessions)"},
|
||||
"small": {"urls": 100, "max_sessions": 8, "chunk_size": 20, "description": "Small test (100 URLs, 8 sessions)"},
|
||||
"medium": {"urls": 500, "max_sessions": 16, "chunk_size": 50, "description": "Medium test (500 URLs, 16 sessions)"},
|
||||
"large": {"urls": 1000, "max_sessions": 32, "chunk_size": 100,"description": "Large test (1000 URLs, 32 sessions)"},
|
||||
"extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200,"description": "Extreme test (2000 URLs, 64 sessions)"},
|
||||
}
|
||||
|
||||
# Arguments to forward directly if present in custom_args
|
||||
FORWARD_ARGS = {
|
||||
"urls": "--urls",
|
||||
"max_sessions": "--max-sessions",
|
||||
"chunk_size": "--chunk-size",
|
||||
"port": "--port",
|
||||
"monitor_mode": "--monitor-mode",
|
||||
}
|
||||
# Boolean flags to forward if True
|
||||
FORWARD_FLAGS = {
|
||||
"stream": "--stream",
|
||||
"use_rate_limiter": "--use-rate-limiter",
|
||||
"keep_server_alive": "--keep-server-alive",
|
||||
"use_existing_site": "--use-existing-site",
|
||||
"skip_generation": "--skip-generation",
|
||||
"keep_site": "--keep-site",
|
||||
"clean_reports": "--clean-reports", # Note: clean behavior is handled here, but pass flag if needed
|
||||
"clean_site": "--clean-site", # Note: clean behavior is handled here, but pass flag if needed
|
||||
}
|
||||
|
||||
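# For illustration, applying the mappings above to the "quick" preset yields roughly:
#   python test_stress_sdk.py --urls 50 --max-sessions 4 --chunk-size 10
# (plus any boolean flags from FORWARD_FLAGS that are set to True).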
def run_benchmark(config_name, custom_args=None, compare=True, clean=False):
|
||||
"""Runs the stress test and optionally the report generator."""
|
||||
if config_name not in TEST_CONFIGS and config_name != "custom":
|
||||
console.print(f"[bold red]Unknown configuration: {config_name}[/bold red]")
|
||||
return False
|
||||
|
||||
# Print header
|
||||
title = "Crawl4AI SDK Benchmark Test"
|
||||
if config_name != "custom":
|
||||
title += f" - {TEST_CONFIGS[config_name]['description']}"
|
||||
else:
|
||||
# Safely get custom args for title
|
||||
urls = custom_args.get('urls', '?') if custom_args else '?'
|
||||
sessions = custom_args.get('max_sessions', '?') if custom_args else '?'
|
||||
title += f" - Custom ({urls} URLs, {sessions} sessions)"
|
||||
|
||||
console.print(f"\n[bold blue]{title}[/bold blue]")
|
||||
console.print("=" * (len(title) + 4)) # Adjust underline length
|
||||
|
||||
console.print("\n[bold white]Preparing test...[/bold white]")
|
||||
|
||||
# --- Command Construction ---
|
||||
# Use the new script name
|
||||
cmd = ["python", "test_stress_sdk.py"]
|
||||
|
||||
# Apply config or custom args
|
||||
args_to_use = {}
|
||||
if config_name != "custom":
|
||||
args_to_use = TEST_CONFIGS[config_name].copy()
|
||||
# If custom args are provided (e.g., boolean flags), overlay them
|
||||
if custom_args:
|
||||
args_to_use.update(custom_args)
|
||||
elif custom_args: # Custom config
|
||||
args_to_use = custom_args.copy()
|
||||
|
||||
# Add arguments with values
|
||||
for key, arg_name in FORWARD_ARGS.items():
|
||||
if key in args_to_use:
|
||||
cmd.extend([arg_name, str(args_to_use[key])])
|
||||
|
||||
# Add boolean flags
|
||||
for key, flag_name in FORWARD_FLAGS.items():
|
||||
if args_to_use.get(key, False): # Check if key exists and is True
|
||||
# Clean-up is driven by run_benchmark's own --clean argument (handled below),
# so --clean-reports / --clean-site are not forwarded from custom_args here;
# all other boolean flags are forwarded as-is.
|
||||
if key not in ["clean_reports", "clean_site"]:
|
||||
cmd.append(flag_name)
|
||||
|
||||
# Handle the top-level --clean flag for run_benchmark
|
||||
if clean:
|
||||
# Pass clean flags to the stress test script as well, if needed
|
||||
# This assumes test_stress_sdk.py also uses --clean-reports and --clean-site
|
||||
cmd.append("--clean-reports")
|
||||
cmd.append("--clean-site")
|
||||
console.print("[yellow]Applying --clean: Cleaning reports and site before test.[/yellow]")
|
||||
# Actual cleaning logic might reside here or be delegated entirely
|
||||
|
||||
console.print(f"\n[bold white]Running stress test:[/bold white] {' '.join(cmd)}")
|
||||
start = time.time()
|
||||
|
||||
# Execute the stress test script
|
||||
# Use Popen to stream output
|
||||
try:
|
||||
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8', errors='replace')
|
||||
while True:
|
||||
line = proc.stdout.readline()
|
||||
if not line:
|
||||
break
|
||||
console.print(line.rstrip()) # Print line by line
|
||||
proc.wait() # Wait for the process to complete
|
||||
except FileNotFoundError:
|
||||
console.print(f"[bold red]Error: Script 'test_stress_sdk.py' not found. Make sure it's in the correct directory.[/bold red]")
|
||||
return False
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error running stress test subprocess: {e}[/bold red]")
|
||||
return False
|
||||
|
||||
|
||||
if proc.returncode != 0:
|
||||
console.print(f"[bold red]Stress test failed with exit code {proc.returncode}[/bold red]")
|
||||
return False
|
||||
|
||||
duration = time.time() - start
|
||||
console.print(f"[bold green]Stress test completed in {duration:.1f} seconds[/bold green]")
|
||||
|
||||
# --- Report Generation (Optional) ---
|
||||
if compare:
|
||||
# Assuming benchmark_report.py exists and works with the generated reports
|
||||
report_script = "benchmark_report.py" # Keep configurable if needed
|
||||
report_cmd = ["python", report_script]
|
||||
console.print(f"\n[bold white]Generating benchmark report: {' '.join(report_cmd)}[/bold white]")
|
||||
|
||||
# Run the report command and capture output
|
||||
try:
|
||||
report_proc = subprocess.run(report_cmd, capture_output=True, text=True, check=False, encoding='utf-8', errors='replace') # Use check=False to handle potential errors
|
||||
|
||||
# Print the captured output from benchmark_report.py
|
||||
if report_proc.stdout:
|
||||
console.print("\n" + report_proc.stdout)
|
||||
if report_proc.stderr:
|
||||
console.print("[yellow]Report generator stderr:[/yellow]\n" + report_proc.stderr)
|
||||
|
||||
if report_proc.returncode != 0:
|
||||
console.print(f"[bold yellow]Benchmark report generation script '{report_script}' failed with exit code {report_proc.returncode}[/bold yellow]")
|
||||
# Don't return False here, test itself succeeded
|
||||
else:
|
||||
console.print(f"[bold green]Benchmark report script '{report_script}' completed.[/bold green]")
|
||||
|
||||
# Find and print clickable links to the reports
|
||||
# Assuming reports are saved in 'benchmark_reports' by benchmark_report.py
|
||||
report_dir = "benchmark_reports"
|
||||
if os.path.isdir(report_dir):
|
||||
report_files = glob.glob(os.path.join(report_dir, "comparison_report_*.html"))
|
||||
if report_files:
|
||||
try:
|
||||
latest_report = max(report_files, key=os.path.getctime)
|
||||
report_path = os.path.abspath(latest_report)
|
||||
report_url = pathlib.Path(report_path).as_uri() # Build a proper file:// URI via pathlib
|
||||
console.print(f"[bold cyan]Click to open report: [link={report_url}]{report_url}[/link][/bold cyan]")
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Could not determine latest report: {e}[/yellow]")
|
||||
|
||||
chart_files = glob.glob(os.path.join(report_dir, "memory_chart_*.png"))
|
||||
if chart_files:
|
||||
try:
|
||||
latest_chart = max(chart_files, key=os.path.getctime)
|
||||
chart_path = os.path.abspath(latest_chart)
|
||||
chart_url = pathlib.Path(chart_path).as_uri()
|
||||
console.print(f"[cyan]Memory chart: [link={chart_url}]{chart_url}[/link][/cyan]")
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Could not determine latest chart: {e}[/yellow]")
|
||||
else:
|
||||
console.print(f"[yellow]Benchmark report directory '{report_dir}' not found. Cannot link reports.[/yellow]")
|
||||
|
||||
except FileNotFoundError:
|
||||
console.print(f"[bold red]Error: Report script '{report_script}' not found.[/bold red]")
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error running report generation subprocess: {e}[/bold red]")
|
||||
|
||||
|
||||
# Prompt to exit
|
||||
console.print("\n[bold green]Benchmark run finished. Press Enter to exit.[/bold green]")
|
||||
try:
|
||||
input() # Wait for user input
|
||||
except EOFError:
|
||||
pass # Handle case where input is piped or unavailable
|
||||
|
||||
return True
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run a Crawl4AI SDK benchmark test and generate a report")
|
||||
|
||||
# --- Arguments ---
|
||||
parser.add_argument("config", choices=list(TEST_CONFIGS) + ["custom"],
|
||||
help="Test configuration: quick, small, medium, large, extreme, or custom")
|
||||
|
||||
# Arguments for 'custom' config or to override presets
|
||||
parser.add_argument("--urls", type=int, help="Number of URLs")
|
||||
parser.add_argument("--max-sessions", type=int, help="Max concurrent sessions (replaces --workers)")
|
||||
parser.add_argument("--chunk-size", type=int, help="URLs per batch (for non-stream logging)")
|
||||
parser.add_argument("--port", type=int, help="HTTP server port")
|
||||
parser.add_argument("--monitor-mode", type=str, choices=["DETAILED", "AGGREGATED"], help="Monitor display mode")
|
||||
|
||||
# Boolean flags / options
|
||||
parser.add_argument("--stream", action="store_true", help="Enable streaming results (disables batch logging)")
|
||||
parser.add_argument("--use-rate-limiter", action="store_true", help="Enable basic rate limiter")
|
||||
parser.add_argument("--no-report", action="store_true", help="Skip generating comparison report")
|
||||
parser.add_argument("--clean", action="store_true", help="Clean up reports and site before running")
|
||||
parser.add_argument("--keep-server-alive", action="store_true", help="Keep HTTP server running after test")
|
||||
parser.add_argument("--use-existing-site", action="store_true", help="Use existing site on specified port")
|
||||
parser.add_argument("--skip-generation", action="store_true", help="Use existing site files without regenerating")
|
||||
parser.add_argument("--keep-site", action="store_true", help="Keep generated site files after test")
|
||||
# Removed url_level_logging as it's implicitly handled by stream/batch mode now
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
custom_args = {}
|
||||
|
||||
# Populate custom_args from explicit command-line args
|
||||
if args.urls is not None: custom_args["urls"] = args.urls
|
||||
if args.max_sessions is not None: custom_args["max_sessions"] = args.max_sessions
|
||||
if args.chunk_size is not None: custom_args["chunk_size"] = args.chunk_size
|
||||
if args.port is not None: custom_args["port"] = args.port
|
||||
if args.monitor_mode is not None: custom_args["monitor_mode"] = args.monitor_mode
|
||||
if args.stream: custom_args["stream"] = True
|
||||
if args.use_rate_limiter: custom_args["use_rate_limiter"] = True
|
||||
if args.keep_server_alive: custom_args["keep_server_alive"] = True
|
||||
if args.use_existing_site: custom_args["use_existing_site"] = True
|
||||
if args.skip_generation: custom_args["skip_generation"] = True
|
||||
if args.keep_site: custom_args["keep_site"] = True
|
||||
# Clean flags are handled by the 'clean' argument passed to run_benchmark
|
||||
|
||||
# Validate custom config requirements
|
||||
if args.config == "custom":
|
||||
required_custom = ["urls", "max_sessions", "chunk_size"]
|
||||
missing = [f"--{arg}" for arg in required_custom if arg not in custom_args]
|
||||
if missing:
|
||||
console.print(f"[bold red]Error: 'custom' config requires: {', '.join(missing)}[/bold red]")
|
||||
return 1
|
||||
|
||||
success = run_benchmark(
|
||||
config_name=args.config,
|
||||
custom_args=custom_args, # Pass all collected custom args
|
||||
compare=not args.no_report,
|
||||
clean=args.clean
|
||||
)
|
||||
return 0 if success else 1
|
||||
|
||||
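# Example invocations (see the argparse definitions above):
#   python run_benchmark.py quick
#   python run_benchmark.py custom --urls 200 --max-sessions 16 --chunk-size 25 --stream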
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
168
tests/memory/test_crawler_monitor.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Test script for the CrawlerMonitor component.
|
||||
This script simulates a crawler with multiple tasks to demonstrate the real-time monitoring capabilities.
|
||||
"""
|
||||
|
||||
import time
|
||||
import uuid
|
||||
import random
|
||||
import threading
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the parent directory to the path to import crawl4ai
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||
|
||||
from crawl4ai.components.crawler_monitor import CrawlerMonitor
|
||||
from crawl4ai.models import CrawlStatus
|
||||
|
||||
def simulate_crawler_task(monitor, task_id, url, simulate_failure=False):
|
||||
"""Simulate a crawler task with different states."""
|
||||
# Task starts in the QUEUED state
|
||||
wait_time = random.uniform(0.5, 3.0)
|
||||
time.sleep(wait_time)
|
||||
|
||||
# Update to IN_PROGRESS state
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.IN_PROGRESS,
|
||||
start_time=time.time(),
|
||||
wait_time=wait_time
|
||||
)
|
||||
|
||||
# Simulate task running
|
||||
process_time = random.uniform(1.0, 5.0)
|
||||
for i in range(int(process_time * 2)):
|
||||
# Simulate memory usage changes
|
||||
memory_usage = random.uniform(5.0, 25.0)
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
memory_usage=memory_usage,
|
||||
peak_memory=max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
|
||||
)
|
||||
time.sleep(0.5)
|
||||
|
||||
# Update to COMPLETED or FAILED state
|
||||
if simulate_failure and random.random() < 0.8: # 80% chance of failure if simulate_failure is True
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
end_time=time.time(),
|
||||
error_message="Simulated failure: Connection timeout",
|
||||
memory_usage=0.0
|
||||
)
|
||||
else:
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.COMPLETED,
|
||||
end_time=time.time(),
|
||||
memory_usage=0.0
|
||||
)
|
||||
|
||||
def update_queue_stats(monitor, num_queued_tasks):
|
||||
"""Update queue statistics periodically."""
|
||||
while monitor.is_running:
|
||||
queued_tasks = [
|
||||
task for task_id, task in monitor.get_all_task_stats().items()
|
||||
if task["status"] == CrawlStatus.QUEUED.name
|
||||
]
|
||||
|
||||
total_queued = len(queued_tasks)
|
||||
|
||||
if total_queued > 0:
|
||||
current_time = time.time()
|
||||
wait_times = [
|
||||
current_time - task.get("enqueue_time", current_time)
|
||||
for task in queued_tasks
|
||||
]
|
||||
highest_wait_time = max(wait_times) if wait_times else 0.0
|
||||
avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
|
||||
else:
|
||||
highest_wait_time = 0.0
|
||||
avg_wait_time = 0.0
|
||||
|
||||
monitor.update_queue_statistics(
|
||||
total_queued=total_queued,
|
||||
highest_wait_time=highest_wait_time,
|
||||
avg_wait_time=avg_wait_time
|
||||
)
|
||||
|
||||
# Simulate memory pressure based on number of active tasks
|
||||
active_tasks = len([
|
||||
task for task_id, task in monitor.get_all_task_stats().items()
|
||||
if task["status"] == CrawlStatus.IN_PROGRESS.name
|
||||
])
|
||||
|
||||
if active_tasks > 8:
|
||||
monitor.update_memory_status("CRITICAL")
|
||||
elif active_tasks > 4:
|
||||
monitor.update_memory_status("PRESSURE")
|
||||
else:
|
||||
monitor.update_memory_status("NORMAL")
|
||||
|
||||
time.sleep(1.0)
|
||||
|
||||
def test_crawler_monitor():
|
||||
"""Test the CrawlerMonitor with simulated crawler tasks."""
|
||||
# Total number of URLs to crawl
|
||||
total_urls = 50
|
||||
|
||||
# Initialize the monitor
|
||||
monitor = CrawlerMonitor(urls_total=total_urls, refresh_rate=0.5)
|
||||
|
||||
# Start the monitor
|
||||
monitor.start()
|
||||
|
||||
# Start thread to update queue statistics
|
||||
queue_stats_thread = threading.Thread(target=update_queue_stats, args=(monitor, total_urls))
|
||||
queue_stats_thread.daemon = True
|
||||
queue_stats_thread.start()
|
||||
|
||||
try:
|
||||
# Create task threads
|
||||
threads = []
|
||||
for i in range(total_urls):
|
||||
task_id = str(uuid.uuid4())
|
||||
url = f"https://example.com/page{i}"
|
||||
|
||||
# Add task to monitor
|
||||
monitor.add_task(task_id, url)
|
||||
|
||||
# Determine if this task should simulate failure
|
||||
simulate_failure = (i % 10 == 0) # Every 10th task
|
||||
|
||||
# Create and start thread for this task
|
||||
thread = threading.Thread(
|
||||
target=simulate_crawler_task,
|
||||
args=(monitor, task_id, url, simulate_failure)
|
||||
)
|
||||
thread.daemon = True
|
||||
threads.append(thread)
|
||||
|
||||
# Start threads with delay to simulate tasks being added over time
|
||||
batch_size = 5
|
||||
for i in range(0, len(threads), batch_size):
|
||||
batch = threads[i:i+batch_size]
|
||||
for thread in batch:
|
||||
thread.start()
|
||||
time.sleep(0.5) # Small delay between starting threads
|
||||
|
||||
# Wait a bit before starting the next batch
|
||||
time.sleep(2.0)
|
||||
|
||||
# Wait for all threads to complete
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
# Keep monitor running a bit longer to see the final state
|
||||
time.sleep(5.0)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nTest interrupted by user")
|
||||
finally:
|
||||
# Stop the monitor
|
||||
monitor.stop()
|
||||
print("\nCrawler monitor test completed")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_crawler_monitor()
|
||||
410
tests/memory/test_dispatcher_stress.py
Normal file
@@ -0,0 +1,410 @@
|
||||
import asyncio
|
||||
import time
|
||||
import psutil
|
||||
import logging
|
||||
import random
|
||||
from typing import List, Dict
|
||||
import uuid
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Import your crawler components
|
||||
from crawl4ai.models import DisplayMode, CrawlStatus, CrawlResult
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import MemoryAdaptiveDispatcher, CrawlerMonitor
|
||||
|
||||
# Global configuration
|
||||
STREAM = False # Toggle between streaming and non-streaming modes
|
||||
|
||||
# Configure logging to file only (to avoid breaking the rich display)
|
||||
os.makedirs("logs", exist_ok=True)
|
||||
file_handler = logging.FileHandler("logs/memory_stress_test.log")
|
||||
file_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
|
||||
|
||||
# Root logger - only to file, not console
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.setLevel(logging.INFO)
|
||||
root_logger.addHandler(file_handler)
|
||||
|
||||
# Our test logger also writes to file only
|
||||
logger = logging.getLogger("memory_stress_test")
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
logger.propagate = False # Don't propagate to root logger
|
||||
|
||||
# Create a memory restrictor to simulate limited memory environment
|
||||
class MemorySimulator:
|
||||
def __init__(self, target_percent: float = 85.0, aggressive: bool = False):
|
||||
"""Simulates memory pressure by allocating memory"""
|
||||
self.target_percent = target_percent
|
||||
self.memory_blocks: List[bytearray] = []
|
||||
self.aggressive = aggressive
|
||||
|
||||
def apply_pressure(self, additional_percent: float = 0.0):
|
||||
"""Fill memory until we reach target percentage"""
|
||||
current_percent = psutil.virtual_memory().percent
|
||||
target = self.target_percent + additional_percent
|
||||
|
||||
if current_percent >= target:
|
||||
return # Already at target
|
||||
|
||||
logger.info(f"Current memory: {current_percent}%, target: {target}%")
|
||||
|
||||
# Calculate how much memory we need to allocate
|
||||
total_memory = psutil.virtual_memory().total
|
||||
target_usage = (target / 100.0) * total_memory
|
||||
current_usage = (current_percent / 100.0) * total_memory
|
||||
bytes_to_allocate = int(target_usage - current_usage)
|
||||
|
||||
if bytes_to_allocate <= 0:
|
||||
return
|
||||
|
||||
# Allocate in smaller chunks to avoid overallocation
|
||||
if self.aggressive:
|
||||
# Use larger chunks for faster allocation in aggressive mode
|
||||
chunk_size = min(bytes_to_allocate, 200 * 1024 * 1024) # 200MB chunks
|
||||
else:
|
||||
chunk_size = min(bytes_to_allocate, 50 * 1024 * 1024) # 50MB chunks
|
||||
|
||||
try:
|
||||
logger.info(f"Allocating {chunk_size / (1024 * 1024):.1f}MB to reach target memory usage")
|
||||
self.memory_blocks.append(bytearray(chunk_size))
|
||||
time.sleep(0.5) # Give system time to register the allocation
|
||||
except MemoryError:
|
||||
logger.warning("Unable to allocate more memory")
|
||||
|
||||
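# Worked example (illustrative numbers): with 16 GB total RAM, 70% currently used and an
# 85% target, bytes_to_allocate = (0.85 - 0.70) * 16 GB ~= 2.4 GB. Each apply_pressure()
# call allocates at most one chunk (50 MB, or 200 MB in aggressive mode), so repeated
# calls are needed to actually reach the target percentage.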
def release_pressure(self, percent: float = None):
|
||||
"""
|
||||
Release allocated memory
|
||||
If percent is specified, release that percentage of blocks
|
||||
"""
|
||||
if not self.memory_blocks:
|
||||
return
|
||||
|
||||
if percent is None:
|
||||
# Release all
|
||||
logger.info(f"Releasing all {len(self.memory_blocks)} memory blocks")
|
||||
self.memory_blocks.clear()
|
||||
else:
|
||||
# Release specified percentage
|
||||
blocks_to_release = int(len(self.memory_blocks) * (percent / 100.0))
|
||||
if blocks_to_release > 0:
|
||||
logger.info(f"Releasing {blocks_to_release} of {len(self.memory_blocks)} memory blocks ({percent}%)")
|
||||
self.memory_blocks = self.memory_blocks[blocks_to_release:]
|
||||
|
||||
def spike_pressure(self, duration: float = 5.0):
|
||||
"""
|
||||
Create a temporary spike in memory pressure then release
|
||||
Useful for forcing requeues
|
||||
"""
|
||||
logger.info(f"Creating memory pressure spike for {duration} seconds")
|
||||
# Save current blocks count
|
||||
initial_blocks = len(self.memory_blocks)
|
||||
|
||||
# Create spike with extra 5%
|
||||
self.apply_pressure(additional_percent=5.0)
|
||||
|
||||
# Schedule release after duration
|
||||
asyncio.create_task(self._delayed_release(duration, initial_blocks))
|
||||
|
||||
async def _delayed_release(self, delay: float, target_blocks: int):
|
||||
"""Helper for spike_pressure - releases extra blocks after delay"""
|
||||
await asyncio.sleep(delay)
|
||||
|
||||
# Remove blocks added since spike started
|
||||
if len(self.memory_blocks) > target_blocks:
|
||||
logger.info(f"Releasing memory spike ({len(self.memory_blocks) - target_blocks} blocks)")
|
||||
self.memory_blocks = self.memory_blocks[:target_blocks]
|
||||
|
||||
# Test statistics collector
|
||||
class TestResults:
|
||||
def __init__(self):
|
||||
self.start_time = time.time()
|
||||
self.completed_urls: List[str] = []
|
||||
self.failed_urls: List[str] = []
|
||||
self.requeued_count = 0
|
||||
self.memory_warnings = 0
|
||||
self.max_memory_usage = 0.0
|
||||
self.max_queue_size = 0
|
||||
self.max_wait_time = 0.0
|
||||
self.url_to_attempt: Dict[str, int] = {} # Track retries per URL
|
||||
|
||||
def log_summary(self):
|
||||
duration = time.time() - self.start_time
|
||||
logger.info("===== TEST SUMMARY =====")
|
||||
logger.info(f"Stream mode: {'ON' if STREAM else 'OFF'}")
|
||||
logger.info(f"Total duration: {duration:.1f} seconds")
|
||||
logger.info(f"Completed URLs: {len(self.completed_urls)}")
|
||||
logger.info(f"Failed URLs: {len(self.failed_urls)}")
|
||||
logger.info(f"Requeue events: {self.requeued_count}")
|
||||
logger.info(f"Memory warnings: {self.memory_warnings}")
|
||||
logger.info(f"Max memory usage: {self.max_memory_usage:.1f}%")
|
||||
logger.info(f"Max queue size: {self.max_queue_size}")
|
||||
logger.info(f"Max wait time: {self.max_wait_time:.1f} seconds")
|
||||
|
||||
# Log URLs with multiple attempts
|
||||
retried_urls = {url: count for url, count in self.url_to_attempt.items() if count > 1}
|
||||
if retried_urls:
|
||||
logger.info(f"URLs with retries: {len(retried_urls)}")
|
||||
# Log the top 5 most retried
|
||||
top_retries = sorted(retried_urls.items(), key=lambda x: x[1], reverse=True)[:5]
|
||||
for url, count in top_retries:
|
||||
logger.info(f" URL {url[-30:]} had {count} attempts")
|
||||
|
||||
# Write summary to a separate human-readable file
|
||||
with open("logs/test_summary.txt", "w") as f:
|
||||
f.write(f"Stream mode: {'ON' if STREAM else 'OFF'}\n")
|
||||
f.write(f"Total duration: {duration:.1f} seconds\n")
|
||||
f.write(f"Completed URLs: {len(self.completed_urls)}\n")
|
||||
f.write(f"Failed URLs: {len(self.failed_urls)}\n")
|
||||
f.write(f"Requeue events: {self.requeued_count}\n")
|
||||
f.write(f"Memory warnings: {self.memory_warnings}\n")
|
||||
f.write(f"Max memory usage: {self.max_memory_usage:.1f}%\n")
|
||||
f.write(f"Max queue size: {self.max_queue_size}\n")
|
||||
f.write(f"Max wait time: {self.max_wait_time:.1f} seconds\n")
|
||||
|
||||
# Custom monitor that extends CrawlerMonitor with test-specific stats tracking
|
||||
class StressTestMonitor(CrawlerMonitor):
|
||||
def __init__(self, test_results: TestResults, **kwargs):
|
||||
# Initialize the parent CrawlerMonitor
|
||||
super().__init__(**kwargs)
|
||||
self.test_results = test_results
|
||||
|
||||
def update_memory_status(self, status: str):
|
||||
if status != self.memory_status:
|
||||
logger.info(f"Memory status changed: {self.memory_status} -> {status}")
|
||||
if "CRITICAL" in status or "PRESSURE" in status:
|
||||
self.test_results.memory_warnings += 1
|
||||
|
||||
# Track peak memory usage in test results
|
||||
current_memory = psutil.virtual_memory().percent
|
||||
self.test_results.max_memory_usage = max(self.test_results.max_memory_usage, current_memory)
|
||||
|
||||
# Call parent method to update the dashboard
|
||||
super().update_memory_status(status)
|
||||
|
||||
def update_queue_statistics(self, total_queued: int, highest_wait_time: float, avg_wait_time: float):
|
||||
# Track queue metrics in test results
|
||||
self.test_results.max_queue_size = max(self.test_results.max_queue_size, total_queued)
|
||||
self.test_results.max_wait_time = max(self.test_results.max_wait_time, highest_wait_time)
|
||||
|
||||
# Call parent method to update the dashboard
|
||||
super().update_queue_statistics(total_queued, highest_wait_time, avg_wait_time)
|
||||
|
||||
def update_task(self, task_id: str, **kwargs):
|
||||
# Track URL status changes for test results
|
||||
if task_id in self.stats:
|
||||
old_status = self.stats[task_id].status
|
||||
|
||||
# If this is a requeue event (requeued due to memory pressure)
|
||||
if kwargs.get('error_message') and 'requeued' in kwargs['error_message']:
|
||||
if not hasattr(self.stats[task_id], 'counted_requeue') or not self.stats[task_id].counted_requeue:
|
||||
self.test_results.requeued_count += 1
|
||||
self.stats[task_id].counted_requeue = True
|
||||
|
||||
# Track completion status for test results
|
||||
if 'status' in kwargs:
|
||||
new_status = kwargs['status']
|
||||
if old_status != new_status:
|
||||
if new_status == CrawlStatus.COMPLETED:
|
||||
if task_id not in self.test_results.completed_urls:
|
||||
self.test_results.completed_urls.append(task_id)
|
||||
elif new_status == CrawlStatus.FAILED:
|
||||
if task_id not in self.test_results.failed_urls:
|
||||
self.test_results.failed_urls.append(task_id)
|
||||
|
||||
# Call parent method to update the dashboard
|
||||
super().update_task(task_id, **kwargs)
|
||||
self.live.update(self._create_table())
|
||||
|
||||
# Generate test URLs - use example.com with unique paths to avoid browser caching
|
||||
def generate_test_urls(count: int) -> List[str]:
|
||||
urls = []
|
||||
for i in range(count):
|
||||
# Add random path and query parameters to create unique URLs
|
||||
path = f"/path/{uuid.uuid4()}"
|
||||
query = f"?test={i}&random={random.randint(1, 100000)}"
|
||||
urls.append(f"https://example.com{path}{query}")
|
||||
return urls
|
||||
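# Example generated URL (illustrative): "https://example.com/path/<uuid4>?test=0&random=48213"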
|
||||
# Process result callback
|
||||
async def process_result(result, test_results: TestResults):
|
||||
# Track attempt counts
|
||||
if result.url not in test_results.url_to_attempt:
|
||||
test_results.url_to_attempt[result.url] = 1
|
||||
else:
|
||||
test_results.url_to_attempt[result.url] += 1
|
||||
|
||||
if "requeued" in result.error_message:
|
||||
test_results.requeued_count += 1
|
||||
logger.debug(f"Requeued due to memory pressure: {result.url}")
|
||||
elif result.success:
|
||||
test_results.completed_urls.append(result.url)
|
||||
logger.debug(f"Successfully processed: {result.url}")
|
||||
else:
|
||||
test_results.failed_urls.append(result.url)
|
||||
logger.warning(f"Failed to process: {result.url} - {result.error_message}")
|
||||
|
||||
# Process multiple results (used in non-streaming mode)
|
||||
async def process_results(results, test_results: TestResults):
|
||||
for result in results:
|
||||
await process_result(result, test_results)
|
||||
|
||||
# Main test function for extreme memory pressure simulation
|
||||
async def run_memory_stress_test(
|
||||
url_count: int = 100,
|
||||
target_memory_percent: float = 92.0, # Push to dangerous levels
|
||||
chunk_size: int = 20, # Larger chunks for more chaos
|
||||
aggressive: bool = False,
|
||||
spikes: bool = True
|
||||
):
|
||||
test_results = TestResults()
|
||||
memory_simulator = MemorySimulator(target_percent=target_memory_percent, aggressive=aggressive)
|
||||
|
||||
logger.info(f"Starting stress test with {url_count} URLs in {'STREAM' if STREAM else 'NON-STREAM'} mode")
|
||||
logger.info(f"Target memory usage: {target_memory_percent}%")
|
||||
|
||||
# First, elevate memory usage to create pressure
|
||||
logger.info("Creating initial memory pressure...")
|
||||
memory_simulator.apply_pressure()
|
||||
|
||||
# Create test URLs in chunks to simulate real-world crawling where URLs are discovered
|
||||
all_urls = generate_test_urls(url_count)
|
||||
url_chunks = [all_urls[i:i+chunk_size] for i in range(0, len(all_urls), chunk_size)]
|
||||
|
||||
# Set up the crawler components - low memory thresholds to create more requeues
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
verbose=False,
|
||||
stream=STREAM # Use the global STREAM variable to set mode
|
||||
)
|
||||
|
||||
# Create monitor with reference to test results
|
||||
monitor = StressTestMonitor(
|
||||
test_results=test_results,
|
||||
display_mode=DisplayMode.DETAILED,
|
||||
max_visible_rows=20,
|
||||
total_urls=url_count # Pass total URLs count
|
||||
)
|
||||
|
||||
# Create dispatcher with EXTREME settings - pure survival mode
|
||||
# These settings are designed to create a memory battleground
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=63.0, # Start throttling at just 63% memory
|
||||
critical_threshold_percent=70.0, # Start requeuing at 70% - incredibly aggressive
|
||||
recovery_threshold_percent=55.0, # Only resume normal ops when plenty of memory available
|
||||
check_interval=0.1, # Check extremely frequently (100ms)
|
||||
max_session_permit=20 if aggressive else 10, # Double the concurrent sessions - pure chaos
|
||||
fairness_timeout=10.0, # Extremely low timeout - rapid priority changes
|
||||
monitor=monitor
|
||||
)
|
||||
|
||||
# Set up spike schedule if enabled
|
||||
if spikes:
|
||||
spike_intervals = []
|
||||
# Create 3-5 random spike times
|
||||
num_spikes = random.randint(3, 5)
|
||||
for _ in range(num_spikes):
|
||||
# Schedule spikes at random chunks
|
||||
chunk_index = random.randint(1, len(url_chunks) - 1)
|
||||
spike_intervals.append(chunk_index)
|
||||
logger.info(f"Scheduled memory spikes at chunks: {spike_intervals}")
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Process URLs in chunks to simulate discovering URLs over time
|
||||
for chunk_index, url_chunk in enumerate(url_chunks):
|
||||
logger.info(f"Processing chunk {chunk_index+1}/{len(url_chunks)} ({len(url_chunk)} URLs)")
|
||||
|
||||
# Regular pressure increases
|
||||
if chunk_index % 2 == 0:
|
||||
logger.info("Increasing memory pressure...")
|
||||
memory_simulator.apply_pressure()
|
||||
|
||||
# Memory spike if scheduled for this chunk
|
||||
if spikes and chunk_index in spike_intervals:
|
||||
logger.info(f"⚠️ CREATING MASSIVE MEMORY SPIKE at chunk {chunk_index+1} ⚠️")
|
||||
# Create a nightmare scenario - multiple overlapping spikes
|
||||
memory_simulator.spike_pressure(duration=10.0) # 10-second spike
|
||||
|
||||
# 50% chance of double-spike (pure evil)
|
||||
if random.random() < 0.5:
|
||||
await asyncio.sleep(2.0) # Wait 2 seconds
|
||||
logger.info("💀 DOUBLE SPIKE - EXTREME MEMORY PRESSURE 💀")
|
||||
memory_simulator.spike_pressure(duration=8.0) # 8-second overlapping spike
|
||||
|
||||
if STREAM:
|
||||
# Stream mode - process results as they come in
|
||||
async for result in dispatcher.run_urls_stream(
|
||||
urls=url_chunk,
|
||||
crawler=crawler,
|
||||
config=run_config
|
||||
):
|
||||
await process_result(result, test_results)
|
||||
else:
|
||||
# Non-stream mode - get all results at once
|
||||
results = await dispatcher.run_urls(
|
||||
urls=url_chunk,
|
||||
crawler=crawler,
|
||||
config=run_config
|
||||
)
|
||||
await process_results(results, test_results)
|
||||
|
||||
# Simulate discovering more URLs while others are still processing
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# RARELY release pressure - make the system fight for resources
|
||||
if chunk_index % 5 == 4: # Less frequent releases
|
||||
release_percent = random.choice([10, 15, 20]) # Smaller, inconsistent releases
|
||||
logger.info(f"Releasing {release_percent}% of memory blocks - brief respite")
|
||||
memory_simulator.release_pressure(percent=release_percent)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Test error: {str(e)}")
|
||||
raise
|
||||
finally:
|
||||
# Release memory pressure
|
||||
memory_simulator.release_pressure()
|
||||
# Log final results
|
||||
test_results.log_summary()
|
||||
|
||||
# Check for success criteria
|
||||
if len(test_results.completed_urls) + len(test_results.failed_urls) < url_count:
|
||||
logger.error(f"TEST FAILED: Not all URLs were processed. {url_count - len(test_results.completed_urls) - len(test_results.failed_urls)} URLs missing.")
|
||||
return False
|
||||
|
||||
logger.info("TEST PASSED: All URLs were processed without crashing.")
|
||||
return True
|
||||
|
||||
# Command-line entry point
|
||||
if __name__ == "__main__":
|
||||
# Parse command line arguments
|
||||
url_count = int(sys.argv[1]) if len(sys.argv) > 1 else 100
|
||||
target_memory = float(sys.argv[2]) if len(sys.argv) > 2 else 85.0
|
||||
|
||||
# Check if stream mode is specified
|
||||
if len(sys.argv) > 3:
|
||||
STREAM = sys.argv[3].lower() in ('true', 'yes', '1', 'stream')
|
||||
|
||||
# Check if aggressive mode is specified
|
||||
aggressive = False
|
||||
if len(sys.argv) > 4:
|
||||
aggressive = sys.argv[4].lower() in ('true', 'yes', '1', 'aggressive')
|
||||
|
||||
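# Example invocation (positional args as parsed above):
#   python test_dispatcher_stress.py 200 90 stream aggressive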
print(f"Starting test with {url_count} URLs, {target_memory}% memory target")
|
||||
print(f"Stream mode: {STREAM}, Aggressive: {aggressive}")
|
||||
print("Logs will be written to the logs directory")
|
||||
print("Live display starting now...")
|
||||
|
||||
# Run the test
|
||||
result = asyncio.run(run_memory_stress_test(
|
||||
url_count=url_count,
|
||||
target_memory_percent=target_memory,
|
||||
aggressive=aggressive
|
||||
))
|
||||
|
||||
# Exit with status code
|
||||
sys.exit(0 if result else 1)
|
||||
36
tests/memory/test_docker_config_gen.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick sanity-check for the /config/dump endpoint.
|
||||
|
||||
Usage:
|
||||
python test_docker_config_gen.py [http://localhost:8020]
|
||||
|
||||
If the server isn’t running, start it first:
|
||||
uvicorn deploy.docker.server:app --port 8020
|
||||
"""
|
||||
|
||||
import sys, json, textwrap, requests
|
||||
|
||||
# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
|
||||
BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
|
||||
URL = f"{BASE.rstrip('/')}/config/dump"
|
||||
|
||||
CASES = [
|
||||
# --- CrawlerRunConfig variants ---
|
||||
"CrawlerRunConfig()",
|
||||
"CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
|
||||
"CrawlerRunConfig(js_only=True, wait_until='networkidle')",
|
||||
|
||||
# --- BrowserConfig variants ---
|
||||
"BrowserConfig()",
|
||||
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
||||
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
|
||||
]
|
||||
|
||||
for code in CASES:
|
||||
print("\n=== POST:", code)
|
||||
resp = requests.post(URL, json={"code": code}, timeout=15)
|
||||
if resp.ok:
|
||||
print(json.dumps(resp.json(), indent=2)[:400] + "...")
|
||||
else:
|
||||
print("ERROR", resp.status_code, resp.text[:200])
|
||||
520
tests/memory/test_stress_api.py
Normal file
@@ -0,0 +1,520 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Stress test for Crawl4AI's Docker API server (/crawl and /crawl/stream endpoints).
|
||||
|
||||
This version targets a running Crawl4AI API server, sending concurrent requests
|
||||
to test its ability to handle multiple crawl jobs simultaneously.
|
||||
It uses httpx for async HTTP requests and logs results per batch of requests,
|
||||
including server-side memory usage reported by the API.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import uuid
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
from typing import List, Dict, Optional, Union, AsyncGenerator, Tuple
|
||||
import httpx
|
||||
import pathlib # Import pathlib explicitly
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.syntax import Syntax
|
||||
|
||||
# --- Constants ---
|
||||
DEFAULT_API_URL = "http://localhost:11235" # Default port
|
||||
DEFAULT_API_URL = "http://localhost:8020" # Default port
|
||||
DEFAULT_URL_COUNT = 100
|
||||
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
|
||||
DEFAULT_CHUNK_SIZE = 10
|
||||
DEFAULT_REPORT_PATH = "reports_api"
|
||||
DEFAULT_STREAM_MODE = True
|
||||
REQUEST_TIMEOUT = 180.0
|
||||
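# Example invocation (the flags are defined in main() at the bottom of this file):
#   python test_stress_api.py --urls 100 --max-concurrent-requests 4 --chunk-size 10 --stream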
|
||||
# Initialize Rich console
|
||||
console = Console()
|
||||
|
||||
# --- API Health Check (Unchanged) ---
|
||||
async def check_server_health(client: httpx.AsyncClient, health_endpoint: str = "/health"):
|
||||
"""Check if the API server is healthy."""
|
||||
console.print(f"[bold cyan]Checking API server health at {client.base_url}{health_endpoint}...[/]", end="")
|
||||
try:
|
||||
response = await client.get(health_endpoint, timeout=10.0)
|
||||
response.raise_for_status()
|
||||
health_data = response.json()
|
||||
version = health_data.get('version', 'N/A')
|
||||
console.print(f"[bold green] Server OK! Version: {version}[/]")
|
||||
return True
|
||||
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||
console.print(f"\n[bold red]Server health check FAILED:[/]")
|
||||
console.print(f"Error: {e}")
|
||||
console.print(f"Is the server running and accessible at {client.base_url}?")
|
||||
return False
|
||||
except Exception as e:
|
||||
console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
|
||||
console.print(e)
|
||||
return False
|
||||
|
||||
# --- API Stress Test Class ---
|
||||
class ApiStressTest:
|
||||
"""Orchestrates the stress test by sending concurrent requests to the API."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_url: str,
|
||||
url_count: int,
|
||||
max_concurrent_requests: int,
|
||||
chunk_size: int,
|
||||
report_path: str,
|
||||
stream_mode: bool,
|
||||
):
|
||||
self.api_base_url = api_url.rstrip('/')
|
||||
self.url_count = url_count
|
||||
self.max_concurrent_requests = max_concurrent_requests
|
||||
self.chunk_size = chunk_size
|
||||
self.report_path = pathlib.Path(report_path)
|
||||
self.report_path.mkdir(parents=True, exist_ok=True)
|
||||
self.stream_mode = stream_mode
|
||||
|
||||
# Ignore repo path and set it to current file path
|
||||
self.repo_path = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
self.test_id = time.strftime("%Y%m%d_%H%M%S")
|
||||
self.results_summary = {
|
||||
"test_id": self.test_id, "api_url": api_url, "url_count": url_count,
|
||||
"max_concurrent_requests": max_concurrent_requests, "chunk_size": chunk_size,
|
||||
"stream_mode": stream_mode, "start_time": "", "end_time": "",
|
||||
"total_time_seconds": 0, "successful_requests": 0, "failed_requests": 0,
|
||||
"successful_urls": 0, "failed_urls": 0, "total_urls_processed": 0,
|
||||
"total_api_calls": 0,
|
||||
"server_memory_metrics": { # To store aggregated server memory info
|
||||
"batch_mode_avg_delta_mb": None,
|
||||
"batch_mode_max_delta_mb": None,
|
||||
"stream_mode_avg_max_snapshot_mb": None,
|
||||
"stream_mode_max_max_snapshot_mb": None,
|
||||
"samples": [] # Store individual request memory results
|
||||
}
|
||||
}
|
||||
self.http_client = httpx.AsyncClient(base_url=self.api_base_url, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=max_concurrent_requests + 5, max_keepalive_connections=max_concurrent_requests))
|
||||
|
||||
async def close_client(self):
|
||||
"""Close the httpx client."""
|
||||
await self.http_client.aclose()
|
||||
|
||||
async def run(self) -> Dict:
|
||||
"""Run the API stress test."""
|
||||
# No client memory tracker needed
|
||||
urls_to_process = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(self.url_count)]
|
||||
url_chunks = [urls_to_process[i:i+self.chunk_size] for i in range(0, len(urls_to_process), self.chunk_size)]
|
||||
|
||||
self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
start_time = time.time()
|
||||
|
||||
console.print(f"\n[bold cyan]Crawl4AI API Stress Test - {self.url_count} URLs, {self.max_concurrent_requests} concurrent requests[/bold cyan]")
|
||||
console.print(f"[bold cyan]Target API:[/bold cyan] {self.api_base_url}, [bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]URLs per Request:[/bold cyan] {self.chunk_size}")
|
||||
# Removed client memory log
|
||||
|
||||
semaphore = asyncio.Semaphore(self.max_concurrent_requests)
|
||||
|
||||
# Updated Batch logging header
|
||||
console.print("\n[bold]API Request Batch Progress:[/bold]")
|
||||
# Adjusted spacing and added Peak
|
||||
console.print("[bold] Batch | Progress | SrvMem Peak / Δ|Max (MB) | Reqs/sec | S/F URLs | Time (s) | Status [/bold]")
|
||||
# Adjust separator length if needed, looks okay for now
|
||||
console.print("─" * 95)
|
||||
|
||||
# No client memory monitor task needed
|
||||
|
||||
tasks = []
|
||||
total_api_calls = len(url_chunks)
|
||||
self.results_summary["total_api_calls"] = total_api_calls
|
||||
|
||||
try:
|
||||
for i, chunk in enumerate(url_chunks):
|
||||
task = asyncio.create_task(self._make_api_request(
|
||||
chunk=chunk,
|
||||
batch_idx=i + 1,
|
||||
total_batches=total_api_calls,
|
||||
semaphore=semaphore
|
||||
# No memory tracker passed
|
||||
))
|
||||
tasks.append(task)
|
||||
|
||||
api_results = await asyncio.gather(*tasks)
|
||||
|
||||
# Process aggregated results including server memory
|
||||
total_successful_requests = sum(1 for r in api_results if r['request_success'])
|
||||
total_failed_requests = total_api_calls - total_successful_requests
|
||||
total_successful_urls = sum(r['success_urls'] for r in api_results)
|
||||
total_failed_urls = sum(r['failed_urls'] for r in api_results)
|
||||
total_urls_processed = total_successful_urls + total_failed_urls
|
||||
|
||||
# Aggregate server memory metrics
|
||||
valid_samples = [r for r in api_results if r.get('server_delta_or_max_mb') is not None] # Filter results with valid mem data
|
||||
self.results_summary["server_memory_metrics"]["samples"] = valid_samples # Store raw samples with both peak and delta/max
|
||||
|
||||
if valid_samples:
|
||||
delta_or_max_values = [r['server_delta_or_max_mb'] for r in valid_samples]
|
||||
if self.stream_mode:
|
||||
# Stream mode: delta_or_max holds max snapshot
|
||||
self.results_summary["server_memory_metrics"]["stream_mode_avg_max_snapshot_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
|
||||
self.results_summary["server_memory_metrics"]["stream_mode_max_max_snapshot_mb"] = max(delta_or_max_values)
|
||||
else: # Batch mode
|
||||
# delta_or_max holds delta
|
||||
self.results_summary["server_memory_metrics"]["batch_mode_avg_delta_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
|
||||
self.results_summary["server_memory_metrics"]["batch_mode_max_delta_mb"] = max(delta_or_max_values)
|
||||
|
||||
# Aggregate peak values for batch mode
|
||||
peak_values = [r['server_peak_memory_mb'] for r in valid_samples if r.get('server_peak_memory_mb') is not None]
|
||||
if peak_values:
|
||||
self.results_summary["server_memory_metrics"]["batch_mode_avg_peak_mb"] = sum(peak_values) / len(peak_values)
|
||||
self.results_summary["server_memory_metrics"]["batch_mode_max_peak_mb"] = max(peak_values)
|
||||
|
||||
|
||||
self.results_summary.update({
|
||||
"successful_requests": total_successful_requests,
|
||||
"failed_requests": total_failed_requests,
|
||||
"successful_urls": total_successful_urls,
|
||||
"failed_urls": total_failed_urls,
|
||||
"total_urls_processed": total_urls_processed,
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]An error occurred during task execution: {e}[/bold red]")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# No finally block needed for monitor task
|
||||
|
||||
end_time = time.time()
|
||||
self.results_summary.update({
|
||||
"end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"total_time_seconds": end_time - start_time,
|
||||
# No client memory report
|
||||
})
|
||||
self._save_results()
|
||||
return self.results_summary
|
||||
|
||||
async def _make_api_request(
|
||||
self,
|
||||
chunk: List[str],
|
||||
batch_idx: int,
|
||||
total_batches: int,
|
||||
semaphore: asyncio.Semaphore
|
||||
# No memory tracker
|
||||
) -> Dict:
|
||||
"""Makes a single API request for a chunk of URLs, handling concurrency and logging server memory."""
|
||||
request_success = False
|
||||
success_urls = 0
|
||||
failed_urls = 0
|
||||
status = "Pending"
|
||||
status_color = "grey"
|
||||
server_memory_metric = None # Store delta (batch) or max snapshot (stream)
server_peak_mem_mb = None   # Peak memory reported by the server (batch responses only)
|
||||
api_call_start_time = time.time()
|
||||
|
||||
async with semaphore:
|
||||
try:
|
||||
# No client memory sampling
|
||||
|
||||
endpoint = "/crawl/stream" if self.stream_mode else "/crawl"
|
||||
payload = {
|
||||
"urls": chunk,
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"cache_mode": "BYPASS", "stream": self.stream_mode}
|
||||
}
|
||||
}
|
||||
|
||||
if self.stream_mode:
|
||||
max_server_mem_snapshot = 0.0 # Track max memory seen in this stream
|
||||
async with self.http_client.stream("POST", endpoint, json=payload) as response:
|
||||
initial_status_code = response.status_code
|
||||
response.raise_for_status()
|
||||
|
||||
completed_marker_received = False
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get("status") == "completed":
|
||||
completed_marker_received = True
|
||||
break
|
||||
elif data.get("url"):
|
||||
if data.get("success"): success_urls += 1
|
||||
else: failed_urls += 1
|
||||
# Extract server memory snapshot per result
|
||||
mem_snapshot = data.get('server_memory_mb')
|
||||
if mem_snapshot is not None:
|
||||
max_server_mem_snapshot = max(max_server_mem_snapshot, float(mem_snapshot))
|
||||
except json.JSONDecodeError:
|
||||
console.print(f"[Batch {batch_idx}] [red]Stream decode error for line:[/red] {line}")
|
||||
failed_urls = len(chunk)
|
||||
break
|
||||
request_success = completed_marker_received
|
||||
if not request_success:
|
||||
failed_urls = len(chunk) - success_urls
|
||||
server_memory_metric = max_server_mem_snapshot # Use max snapshot for stream logging
|
||||
|
||||
else: # Batch mode
|
||||
response = await self.http_client.post(endpoint, json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
# Extract server memory delta from the response
|
||||
server_memory_metric = data.get('server_memory_delta_mb')
|
||||
server_peak_mem_mb = data.get('server_peak_memory_mb')
|
||||
|
||||
if data.get("success") and "results" in data:
|
||||
request_success = True
|
||||
results_list = data.get("results", [])
|
||||
for result_item in results_list:
|
||||
if result_item.get("success"): success_urls += 1
|
||||
else: failed_urls += 1
|
||||
if len(results_list) != len(chunk):
|
||||
console.print(f"[Batch {batch_idx}] [yellow]Warning: Result count ({len(results_list)}) doesn't match URL count ({len(chunk)})[/yellow]")
|
||||
failed_urls = len(chunk) - success_urls
|
||||
else:
|
||||
request_success = False
|
||||
failed_urls = len(chunk)
|
||||
# Try to get memory from error detail if available
|
||||
detail = data.get('detail')
|
||||
if isinstance(detail, str):
|
||||
try: detail_json = json.loads(detail)
|
||||
except json.JSONDecodeError: detail_json = {}
|
||||
elif isinstance(detail, dict):
|
||||
detail_json = detail
|
||||
else: detail_json = {}
|
||||
server_peak_mem_mb = detail_json.get('server_peak_memory_mb', None)
|
||||
server_memory_metric = detail_json.get('server_memory_delta_mb', None)
|
||||
console.print(f"[Batch {batch_idx}] [red]API request failed:[/red] {detail_json.get('error', 'No details')}")
|
||||
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
request_success = False
|
||||
failed_urls = len(chunk)
|
||||
console.print(f"[Batch {batch_idx}] [bold red]HTTP Error {e.response.status_code}:[/] {e.request.url}")
|
||||
try:
|
||||
error_detail = e.response.json()
|
||||
# Attempt to extract memory info even from error responses
|
||||
detail_content = error_detail.get('detail', {})
|
||||
if isinstance(detail_content, str): # Handle if detail is stringified JSON
|
||||
try: detail_content = json.loads(detail_content)
|
||||
except json.JSONDecodeError: detail_content = {}
|
||||
server_memory_metric = detail_content.get('server_memory_delta_mb', None)
|
||||
server_peak_mem_mb = detail_content.get('server_peak_memory_mb', None)
|
||||
console.print(f"Response: {error_detail}")
|
||||
except Exception:
|
||||
console.print(f"Response Text: {e.response.text[:200]}...")
|
||||
except httpx.RequestError as e:
|
||||
request_success = False
|
||||
failed_urls = len(chunk)
|
||||
console.print(f"[Batch {batch_idx}] [bold red]Request Error:[/bold] {e.request.url} - {e}")
|
||||
except Exception as e:
|
||||
request_success = False
|
||||
failed_urls = len(chunk)
|
||||
console.print(f"[Batch {batch_idx}] [bold red]Unexpected Error:[/bold] {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
finally:
|
||||
api_call_time = time.time() - api_call_start_time
|
||||
total_processed_urls = success_urls + failed_urls
|
||||
|
||||
if request_success and failed_urls == 0: status_color, status = "green", "Success"
|
||||
elif request_success and success_urls > 0: status_color, status = "yellow", "Partial"
|
||||
else: status_color, status = "red", "Failed"
|
||||
|
||||
current_total_urls = batch_idx * self.chunk_size
|
||||
progress_pct = min(100.0, (current_total_urls / self.url_count) * 100)
|
||||
reqs_per_sec = 1.0 / api_call_time if api_call_time > 0 else float('inf')
|
||||
|
||||
# --- New Memory Formatting ---
|
||||
mem_display = " N/A " # Default
|
||||
peak_mem_value = None
|
||||
delta_or_max_value = None
|
||||
|
||||
if self.stream_mode:
|
||||
# server_memory_metric holds max snapshot for stream
|
||||
if server_memory_metric is not None:
|
||||
mem_display = f"{server_memory_metric:.1f} (Max)"
|
||||
delta_or_max_value = server_memory_metric # Store for aggregation
|
||||
else: # Batch mode - expect peak and delta
|
||||
# We need to get peak and delta from the API response
|
||||
peak_mem_value = server_peak_mem_mb # Set from the batch response (or error detail) above
|
||||
delta_value = server_memory_metric # server_memory_metric holds delta for batch
|
||||
|
||||
if peak_mem_value is not None and delta_value is not None:
|
||||
mem_display = f"{peak_mem_value:.1f} / {delta_value:+.1f}"
|
||||
delta_or_max_value = delta_value # Store delta for aggregation
|
||||
elif peak_mem_value is not None:
|
||||
mem_display = f"{peak_mem_value:.1f} / N/A"
|
||||
elif delta_value is not None:
|
||||
mem_display = f"N/A / {delta_value:+.1f}"
|
||||
delta_or_max_value = delta_value # Store delta for aggregation
|
||||
|
||||
# --- Updated Print Statement with Adjusted Padding ---
|
||||
console.print(
|
||||
f" {batch_idx:<5} | {progress_pct:6.1f}% | {mem_display:>24} | {reqs_per_sec:8.1f} | " # Increased width for memory column
|
||||
f"{success_urls:^7}/{failed_urls:<6} | {api_call_time:8.2f} | [{status_color}]{status:<7}[/{status_color}] " # Added trailing space
|
||||
)
|
||||
|
||||
# --- Updated Return Dictionary ---
|
||||
return_data = {
|
||||
"batch_idx": batch_idx,
|
||||
"request_success": request_success,
|
||||
"success_urls": success_urls,
|
||||
"failed_urls": failed_urls,
|
||||
"time": api_call_time,
|
||||
# Return both peak (if available) and delta/max
|
||||
"server_peak_memory_mb": peak_mem_value, # Will be None for stream mode
|
||||
"server_delta_or_max_mb": delta_or_max_value # Delta for batch, Max for stream
|
||||
}
|
||||
# Add back the specific batch mode delta if needed elsewhere, but delta_or_max covers it
|
||||
# if not self.stream_mode:
|
||||
# return_data["server_memory_delta_mb"] = delta_value
|
||||
return return_data
|
||||
|
||||
# No _periodic_memory_sample needed
|
||||
|
||||
def _save_results(self) -> None:
|
||||
"""Saves the results summary to a JSON file."""
|
||||
results_path = self.report_path / f"api_test_summary_{self.test_id}.json"
|
||||
try:
|
||||
# No client memory path to convert
|
||||
with open(results_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(self.results_summary, f, indent=2, default=str)
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
|
||||
|
||||
|
||||
# --- run_full_test Function ---
|
||||
async def run_full_test(args):
|
||||
"""Runs the full API stress test process."""
|
||||
client = httpx.AsyncClient(base_url=args.api_url, timeout=REQUEST_TIMEOUT)
|
||||
|
||||
if not await check_server_health(client):
|
||||
console.print("[bold red]Aborting test due to server health check failure.[/]")
|
||||
await client.aclose()
|
||||
return
|
||||
await client.aclose()
|
||||
|
||||
test = ApiStressTest(
|
||||
api_url=args.api_url,
|
||||
url_count=args.urls,
|
||||
max_concurrent_requests=args.max_concurrent_requests,
|
||||
chunk_size=args.chunk_size,
|
||||
report_path=args.report_path,
|
||||
stream_mode=args.stream,
|
||||
)
|
||||
results = {}
|
||||
try:
|
||||
results = await test.run()
|
||||
finally:
|
||||
await test.close_client()
|
||||
|
||||
if not results:
|
||||
console.print("[bold red]Test did not produce results.[/bold red]")
|
||||
return
|
||||
|
||||
console.print("\n" + "=" * 80)
|
||||
console.print("[bold green]API Stress Test Completed[/bold green]")
|
||||
console.print("=" * 80)
|
||||
|
||||
success_rate_reqs = results["successful_requests"] / results["total_api_calls"] * 100 if results["total_api_calls"] > 0 else 0
|
||||
success_rate_urls = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
|
||||
urls_per_second = results["total_urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
|
||||
reqs_per_second = results["total_api_calls"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
|
||||
|
||||
|
||||
console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
|
||||
console.print(f"[bold cyan]Target API:[/bold cyan] {results['api_url']}")
|
||||
console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_concurrent_requests']} concurrent client requests, URLs/Req: {results['chunk_size']}, Stream: {results['stream_mode']}")
|
||||
console.print(f"[bold cyan]API Requests:[/bold cyan] {results['successful_requests']} successful, {results['failed_requests']} failed ({results['total_api_calls']} total, {success_rate_reqs:.1f}% success)")
|
||||
console.print(f"[bold cyan]URL Processing:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['total_urls_processed']} processed, {success_rate_urls:.1f}% success)")
|
||||
console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f}s total | Avg Reqs/sec: {reqs_per_second:.2f} | Avg URLs/sec: {urls_per_second:.2f}")
|
||||
|
||||
# Report Server Memory
|
||||
mem_metrics = results.get("server_memory_metrics", {})
|
||||
mem_samples = mem_metrics.get("samples", [])
|
||||
if mem_samples:
|
||||
num_samples = len(mem_samples)
|
||||
if results['stream_mode']:
|
||||
avg_mem = mem_metrics.get("stream_mode_avg_max_snapshot_mb")
|
||||
max_mem = mem_metrics.get("stream_mode_max_max_snapshot_mb")
|
||||
avg_str = f"{avg_mem:.1f}" if avg_mem is not None else "N/A"
|
||||
max_str = f"{max_mem:.1f}" if max_mem is not None else "N/A"
|
||||
console.print(f"[bold cyan]Server Memory (Stream):[/bold cyan] Avg Max Snapshot: {avg_str} MB | Max Max Snapshot: {max_str} MB (across {num_samples} requests)")
|
||||
else: # Batch mode
|
||||
avg_delta = mem_metrics.get("batch_mode_avg_delta_mb")
|
||||
max_delta = mem_metrics.get("batch_mode_max_delta_mb")
|
||||
avg_peak = mem_metrics.get("batch_mode_avg_peak_mb")
|
||||
max_peak = mem_metrics.get("batch_mode_max_peak_mb")
|
||||
|
||||
avg_delta_str = f"{avg_delta:.1f}" if avg_delta is not None else "N/A"
|
||||
max_delta_str = f"{max_delta:.1f}" if max_delta is not None else "N/A"
|
||||
avg_peak_str = f"{avg_peak:.1f}" if avg_peak is not None else "N/A"
|
||||
max_peak_str = f"{max_peak:.1f}" if max_peak is not None else "N/A"
|
||||
|
||||
console.print(f"[bold cyan]Server Memory (Batch):[/bold cyan] Avg Peak: {avg_peak_str} MB | Max Peak: {max_peak_str} MB | Avg Delta: {avg_delta_str} MB | Max Delta: {max_delta_str} MB (across {num_samples} requests)")
|
||||
else:
|
||||
console.print("[bold cyan]Server Memory:[/bold cyan] No memory data reported by server.")
|
||||
|
||||
|
||||
# No client memory report
|
||||
summary_path = pathlib.Path(args.report_path) / f"api_test_summary_{results['test_id']}.json"
|
||||
console.print(f"[bold green]Results summary saved to {summary_path}[/bold green]")
|
||||
|
||||
if results["failed_requests"] > 0:
|
||||
console.print(f"\n[bold yellow]Warning: {results['failed_requests']} API requests failed ({100-success_rate_reqs:.1f}% failure rate)[/bold yellow]")
|
||||
if results["failed_urls"] > 0:
|
||||
console.print(f"[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate_urls:.1f}% URL failure rate)[/bold yellow]")
|
||||
if results["total_urls_processed"] < results["url_count"]:
|
||||
console.print(f"\n[bold red]Error: Only {results['total_urls_processed']} out of {results['url_count']} target URLs were processed![/bold red]")
|
||||
|
||||
|
||||
# --- main Function (Argument parsing mostly unchanged) ---
|
||||
def main():
|
||||
"""Main entry point for the script."""
|
||||
parser = argparse.ArgumentParser(description="Crawl4AI API Server Stress Test")
|
||||
|
||||
parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL, help=f"Base URL of the Crawl4AI API server (default: {DEFAULT_API_URL})")
|
||||
parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Total number of unique URLs to process via API calls (default: {DEFAULT_URL_COUNT})")
|
||||
parser.add_argument("--max-concurrent-requests", type=int, default=DEFAULT_MAX_CONCURRENT_REQUESTS, help=f"Maximum concurrent API requests from this client (default: {DEFAULT_MAX_CONCURRENT_REQUESTS})")
|
||||
parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per API request payload (default: {DEFAULT_CHUNK_SIZE})")
|
||||
parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Use the /crawl/stream endpoint instead of /crawl (default: {DEFAULT_STREAM_MODE})")
|
||||
parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
|
||||
parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
console.print("[bold underline]Crawl4AI API Stress Test Configuration[/bold underline]")
|
||||
console.print(f"API URL: {args.api_url}")
|
||||
console.print(f"Total URLs: {args.urls}, Concurrent Client Requests: {args.max_concurrent_requests}, URLs per Request: {args.chunk_size}")
|
||||
console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}")
|
||||
console.print(f"Report Path: {args.report_path}")
|
||||
console.print("-" * 40)
|
||||
if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
|
||||
console.print("-" * 40)
|
||||
|
||||
if args.clean_reports:
|
||||
report_dir = pathlib.Path(args.report_path)
|
||||
if report_dir.exists():
|
||||
console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]")
|
||||
shutil.rmtree(args.report_path)
|
||||
report_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
asyncio.run(run_full_test(args))
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
|
||||
except Exception as e:
|
||||
console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
|
||||
# No need to modify sys.path for SimpleMemoryTracker as it's removed
|
||||
main()
|
||||
203
tests/memory/test_stress_api_xs.py
Normal file
203
tests/memory/test_stress_api_xs.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""Lite Crawl4AI API stress‑tester.
|
||||
|
||||
✔ batch or stream mode (single unified path)
|
||||
✔ global stats + JSON summary
|
||||
✔ rich table progress
|
||||
✔ Typer CLI with presets (quick / soak)
|
||||
|
||||
Usage examples:
|
||||
python test_stress_api_xs.py # uses quick preset
|
||||
python test_stress_api_xs.py soak # 5 K URLs stress run
|
||||
python test_stress_api_xs.py --urls 200 --concurrent 10 --chunk 20
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio, json, time, uuid, pathlib, statistics
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import httpx, typer
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
# ───────────────────────── defaults / presets ──────────────────────────
|
||||
PRESETS = {
|
||||
"quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
|
||||
"debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
|
||||
"soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
|
||||
}
|
||||
|
||||
API_HEALTH_ENDPOINT = "/health"
|
||||
REQUEST_TIMEOUT = 180.0
|
||||
|
||||
console = Console()
|
||||
app = typer.Typer(add_completion=False, rich_markup_mode="rich")
|
||||
|
||||
# ───────────────────────── helpers ─────────────────────────────────────
|
||||
async def _check_health(client: httpx.AsyncClient) -> None:
|
||||
resp = await client.get(API_HEALTH_ENDPOINT, timeout=10)
|
||||
resp.raise_for_status()
|
||||
console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]")
|
||||
|
||||
async def _iter_results(resp: httpx.Response, stream: bool):
|
||||
"""Yield result dicts from batch JSON or ND‑JSON stream."""
|
||||
if stream:
|
||||
async for line in resp.aiter_lines():
|
||||
if not line:
|
||||
continue
|
||||
rec = json.loads(line)
|
||||
if rec.get("status") == "completed":
|
||||
break
|
||||
yield rec
|
||||
else:
|
||||
data = resp.json()
|
||||
for rec in data.get("results", []):
|
||||
yield rec, data # rec + whole payload for memory delta/peak
|
||||
|
||||
async def _consume_stream(resp: httpx.Response) -> Dict:
|
||||
stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0}
|
||||
async for line in resp.aiter_lines():
|
||||
if not line:
|
||||
continue
|
||||
rec = json.loads(line)
|
||||
if rec.get("status") == "completed":
|
||||
break
|
||||
if rec.get("success"):
|
||||
stats["success_urls"] += 1
|
||||
else:
|
||||
stats["failed_urls"] += 1
|
||||
mem = rec.get("server_memory_mb")
|
||||
if mem is not None:
|
||||
stats["mem_metric"] = max(stats["mem_metric"], float(mem))
|
||||
return stats
|
||||
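# Note: each line of the /crawl/stream response is parsed as one JSON object; per-URL records
# carry "success" and optionally "server_memory_mb", and a final {"status": "completed"} record
# terminates the stream.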
|
||||
def _consume_batch(body: Dict) -> Dict:
|
||||
stats = {"success_urls": 0, "failed_urls": 0}
|
||||
for rec in body.get("results", []):
|
||||
if rec.get("success"):
|
||||
stats["success_urls"] += 1
|
||||
else:
|
||||
stats["failed_urls"] += 1
|
||||
stats["mem_metric"] = body.get("server_memory_delta_mb")
|
||||
stats["peak"] = body.get("server_peak_memory_mb")
|
||||
return stats
|
||||
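# Note: batch /crawl responses are reduced to per-URL success counts plus the optional
# "server_memory_delta_mb" / "server_peak_memory_mb" fields, the same keys test_stress_api.py
# reads from this endpoint.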
|
||||
async def _fetch_chunk(
|
||||
client: httpx.AsyncClient,
|
||||
urls: List[str],
|
||||
stream: bool,
|
||||
semaphore: asyncio.Semaphore,
|
||||
) -> Dict:
|
||||
endpoint = "/crawl/stream" if stream else "/crawl"
|
||||
payload = {
|
||||
"urls": urls,
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {"type": "CrawlerRunConfig",
|
||||
"params": {"cache_mode": "BYPASS", "stream": stream}},
|
||||
}
|
||||
|
||||
async with semaphore:
|
||||
start = time.perf_counter()
|
||||
|
||||
if stream:
|
||||
# ---- streaming request ----
|
||||
async with client.stream("POST", endpoint, json=payload) as resp:
|
||||
resp.raise_for_status()
|
||||
stats = await _consume_stream(resp)
|
||||
else:
|
||||
# ---- batch request ----
|
||||
resp = await client.post(endpoint, json=payload)
|
||||
resp.raise_for_status()
|
||||
stats = _consume_batch(resp.json())
|
||||
|
||||
stats["elapsed"] = time.perf_counter() - start
|
||||
return stats
|
||||
|
||||
|
||||
# ───────────────────────── core runner ─────────────────────────────────
|
||||
async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
|
||||
client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
|
||||
await _check_health(client)
|
||||
|
||||
url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
|
||||
chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
|
||||
sem = asyncio.Semaphore(concurrent)
|
||||
|
||||
table = Table(show_header=True, header_style="bold magenta")
|
||||
table.add_column("Batch", style="dim", width=6)
|
||||
table.add_column("Success/Fail", width=12)
|
||||
table.add_column("Mem", width=14)
|
||||
table.add_column("Time (s)")
|
||||
|
||||
agg_success = agg_fail = 0
|
||||
deltas, peaks = [], []
|
||||
|
||||
start = time.perf_counter()
|
||||
tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
|
||||
for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
|
||||
res = await coro
|
||||
agg_success += res["success_urls"]
|
||||
agg_fail += res["failed_urls"]
|
||||
if res["mem_metric"] is not None:
|
||||
deltas.append(res["mem_metric"])
|
||||
if res["peak"] is not None:
|
||||
peaks.append(res["peak"])
|
||||
|
||||
mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else "‑"
|
||||
if res["peak"] is not None:
|
||||
mem_txt = f"{res['peak']:.1f}/{mem_txt}"
|
||||
|
||||
table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
|
||||
|
||||
console.print(table)
|
||||
total_time = time.perf_counter() - start
|
||||
|
||||
summary = {
|
||||
"urls": urls,
|
||||
"concurrent": concurrent,
|
||||
"chunk": chunk,
|
||||
"stream": stream,
|
||||
"success_urls": agg_success,
|
||||
"failed_urls": agg_fail,
|
||||
"elapsed_sec": round(total_time, 2),
|
||||
"avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
|
||||
"max_mem": max(deltas) if deltas else None,
|
||||
"avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
|
||||
"max_peak": max(peaks) if peaks else None,
|
||||
}
|
||||
console.print("\n[bold green]Done:[/]" , summary)
|
||||
|
||||
report.mkdir(parents=True, exist_ok=True)
|
||||
path = report / f"api_test_{int(time.time())}.json"
|
||||
path.write_text(json.dumps(summary, indent=2))
|
||||
console.print(f"[green]Summary → {path}")
|
||||
|
||||
await client.aclose()
|
||||
|
||||
# ───────────────────────── Typer CLI ──────────────────────────────────
|
||||
@app.command()
|
||||
def main(
|
||||
preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
|
||||
api_url: str = typer.Option("http://localhost:8020", show_default=True),
|
||||
urls: int = typer.Option(None, help="Total URLs to crawl"),
|
||||
concurrent: int = typer.Option(None, help="Concurrent API requests"),
|
||||
chunk: int = typer.Option(None, help="URLs per request"),
|
||||
stream: bool = typer.Option(None, help="Use /crawl/stream"),
|
||||
report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
|
||||
):
|
||||
"""Run a stress test against a running Crawl4AI API server."""
|
||||
if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)):
|
||||
console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
|
||||
raise typer.Exit(1)
|
||||
|
||||
cfg = PRESETS.get(preset, {})
|
||||
urls = urls or cfg.get("urls")
|
||||
concurrent = concurrent or cfg.get("concurrent")
|
||||
chunk = chunk or cfg.get("chunk")
|
||||
stream = stream if stream is not None else cfg.get("stream", False)
|
||||
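# For example, "python test_stress_api_xs.py soak" expands to urls=5000, concurrent=20,
# chunk=50, stream=True (see PRESETS above).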
|
||||
console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
|
||||
asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
129
tests/memory/test_stress_docker_api.py
Normal file
129
tests/memory/test_stress_docker_api.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
Crawl4AI Docker API stress tester.
|
||||
|
||||
Examples
|
||||
--------
|
||||
python test_stress_docker_api.py --urls 1000 --concurrency 32
|
||||
python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream
|
||||
python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2
|
||||
"""
|
||||
|
||||
import argparse, asyncio, json, secrets, statistics, time
|
||||
from typing import List, Tuple
|
||||
import httpx
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
# ───────────────────────── helpers ─────────────────────────
|
||||
def make_fake_urls(n: int) -> List[str]:
|
||||
base = "https://httpbin.org/anything/"
|
||||
return [f"{base}{secrets.token_hex(8)}" for _ in range(n)]
|
||||
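# The random hex suffix keeps every generated URL distinct, so no two requests in a run hit
# the same httpbin path.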
|
||||
|
||||
async def fire(
|
||||
client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore
|
||||
) -> Tuple[bool, float]:
|
||||
async with sem:
|
||||
print(f"POST {endpoint} with {len(payload['urls'])} URLs")
|
||||
t0 = time.perf_counter()
|
||||
try:
|
||||
if endpoint.endswith("/stream"):
|
||||
async with client.stream("POST", endpoint, json=payload) as r:
|
||||
r.raise_for_status()
|
||||
async for _ in r.aiter_lines():
|
||||
pass
|
||||
else:
|
||||
r = await client.post(endpoint, json=payload)
|
||||
r.raise_for_status()
|
||||
return True, time.perf_counter() - t0
|
||||
except Exception:
|
||||
return False, time.perf_counter() - t0
|
||||
|
||||
|
||||
def pct(lat: List[float], p: float) -> str:
|
||||
"""Return percentile string even for tiny samples."""
|
||||
if not lat:
|
||||
return "-"
|
||||
if len(lat) == 1:
|
||||
return f"{lat[0]:.2f}s"
|
||||
lat_sorted = sorted(lat)
|
||||
k = (p / 100) * (len(lat_sorted) - 1)
|
||||
lo = int(k)
|
||||
hi = min(lo + 1, len(lat_sorted) - 1)
|
||||
frac = k - lo
|
||||
val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac
|
||||
return f"{val:.2f}s"
|
||||
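# Worked example: pct([1.0, 2.0, 3.0], 95) interpolates at k = 1.9 and returns
# 2.0 * 0.1 + 3.0 * 0.9 = "2.90s".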
|
||||
|
||||
# ───────────────────────── main ─────────────────────────
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API")
|
||||
p.add_argument("--urls", type=int, default=100, help="number of URLs")
|
||||
p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight")
|
||||
p.add_argument("--chunk-size", type=int, default=50, help="URLs per request")
|
||||
p.add_argument("--base-url", default="http://localhost:11235", help="API root")
|
||||
# p.add_argument("--base-url", default="http://localhost:8020", help="API root")
|
||||
p.add_argument("--stream", action="store_true", help="use /crawl/stream")
|
||||
p.add_argument("--http2", action="store_true", help="enable HTTP/2")
|
||||
p.add_argument("--headless", action="store_true", default=True)
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
urls = make_fake_urls(args.urls)
|
||||
batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)]
|
||||
endpoint = "/crawl/stream" if args.stream else "/crawl"
|
||||
sem = asyncio.Semaphore(args.concurrency)
|
||||
|
||||
async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client:
|
||||
with Progress(
|
||||
"[progress.description]{task.description}",
|
||||
BarColumn(),
|
||||
"[progress.percentage]{task.percentage:>3.0f}%",
|
||||
TimeElapsedColumn(),
|
||||
TimeRemainingColumn(),
|
||||
) as progress:
|
||||
task_id = progress.add_task("[cyan]bombarding…", total=len(batches))
|
||||
tasks = []
|
||||
for chunk in batches:
|
||||
payload = {
|
||||
"urls": chunk,
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}},
|
||||
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}},
|
||||
}
|
||||
tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem)))
|
||||
progress.advance(task_id)  # advances as each batch is dispatched, not when its request completes
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
ok_latencies = [dt for ok, dt in results if ok]
|
||||
err_count = sum(1 for ok, _ in results if not ok)
|
||||
|
||||
table = Table(title="Docker API Stress‑Test Summary")
|
||||
table.add_column("total", justify="right")
|
||||
table.add_column("errors", justify="right")
|
||||
table.add_column("p50", justify="right")
|
||||
table.add_column("p95", justify="right")
|
||||
table.add_column("max", justify="right")
|
||||
|
||||
table.add_row(
|
||||
str(len(results)),
|
||||
str(err_count),
|
||||
pct(ok_latencies, 50),
|
||||
pct(ok_latencies, 95),
|
||||
f"{max(ok_latencies):.2f}s" if ok_latencies else "-",
|
||||
)
|
||||
console.print(table)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(main())
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]aborted by user[/]")
|
||||
500
tests/memory/test_stress_sdk.py
Normal file
500
tests/memory/test_stress_sdk.py
Normal file
@@ -0,0 +1,500 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Stress test for Crawl4AI's arun_many and dispatcher system.
|
||||
This version uses a local HTTP server and focuses on testing
|
||||
the SDK's ability to handle multiple URLs concurrently, with per-batch logging.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
import pathlib
|
||||
import random
|
||||
import secrets
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import subprocess
|
||||
import signal
|
||||
from typing import List, Dict, Optional, Union, AsyncGenerator
|
||||
import shutil
|
||||
from rich.console import Console
|
||||
|
||||
# Crawl4AI components
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
MemoryAdaptiveDispatcher,
|
||||
CrawlerMonitor,
|
||||
DisplayMode,
|
||||
CrawlResult,
|
||||
RateLimiter,
|
||||
CacheMode,
|
||||
)
|
||||
|
||||
# Constants
|
||||
DEFAULT_SITE_PATH = "test_site"
|
||||
DEFAULT_PORT = 8000
|
||||
DEFAULT_MAX_SESSIONS = 16
|
||||
DEFAULT_URL_COUNT = 1
|
||||
DEFAULT_CHUNK_SIZE = 1 # Define chunk size for batch logging
|
||||
DEFAULT_REPORT_PATH = "reports"
|
||||
DEFAULT_STREAM_MODE = False
|
||||
DEFAULT_MONITOR_MODE = "DETAILED"
|
||||
|
||||
# Initialize Rich console
|
||||
console = Console()
|
||||
|
||||
# --- SiteGenerator Class (Unchanged) ---
|
||||
class SiteGenerator:
|
||||
"""Generates a local test site with heavy pages for stress testing."""
|
||||
|
||||
def __init__(self, site_path: str = DEFAULT_SITE_PATH, page_count: int = DEFAULT_URL_COUNT):
|
||||
self.site_path = pathlib.Path(site_path)
|
||||
self.page_count = page_count
|
||||
self.images_dir = self.site_path / "images"
|
||||
self.lorem_words = " ".join("lorem ipsum dolor sit amet " * 100).split()
|
||||
|
||||
self.html_template = """<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Page {page_num}</title>
|
||||
<meta charset="utf-8">
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Page {page_num}</h1>
|
||||
{paragraphs}
|
||||
{images}
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
def generate_site(self) -> None:
|
||||
self.site_path.mkdir(parents=True, exist_ok=True)
|
||||
self.images_dir.mkdir(exist_ok=True)
|
||||
console.print(f"Generating {self.page_count} test pages...")
|
||||
for i in range(self.page_count):
|
||||
paragraphs = "\n".join(f"<p>{' '.join(random.choices(self.lorem_words, k=200))}</p>" for _ in range(5))
|
||||
images = "\n".join(f'<img src="https://picsum.photos/seed/{secrets.token_hex(8)}/300/200" loading="lazy" alt="Random image {j}"/>' for j in range(3))
|
||||
page_path = self.site_path / f"page_{i}.html"
|
||||
page_path.write_text(self.html_template.format(page_num=i, paragraphs=paragraphs, images=images), encoding="utf-8")
|
||||
if (i + 1) % (self.page_count // 10 or 1) == 0 or i == self.page_count - 1:
|
||||
console.print(f"Generated {i+1}/{self.page_count} pages")
|
||||
self._create_index_page()
|
||||
console.print(f"[bold green]Successfully generated {self.page_count} test pages in [cyan]{self.site_path}[/cyan][/bold green]")
|
||||
|
||||
def _create_index_page(self) -> None:
|
||||
index_content = """<!doctype html><html><head><title>Test Site Index</title><meta charset="utf-8"></head><body><h1>Test Site Index</h1><p>This is an automatically generated site for testing Crawl4AI.</p><div class="page-links">\n"""
|
||||
for i in range(self.page_count):
|
||||
index_content += f' <a href="page_{i}.html">Test Page {i}</a><br>\n'
|
||||
index_content += """ </div></body></html>"""
|
||||
(self.site_path / "index.html").write_text(index_content, encoding="utf-8")
|
||||
|
||||
# --- LocalHttpServer Class (Unchanged) ---
|
||||
class LocalHttpServer:
|
||||
"""Manages a local HTTP server for serving test pages."""
|
||||
def __init__(self, site_path: str = DEFAULT_SITE_PATH, port: int = DEFAULT_PORT):
|
||||
self.site_path = pathlib.Path(site_path)
|
||||
self.port = port
|
||||
self.process = None
|
||||
|
||||
def start(self) -> None:
|
||||
if not self.site_path.exists(): raise FileNotFoundError(f"Site directory {self.site_path} does not exist")
|
||||
console.print(f"Attempting to start HTTP server in [cyan]{self.site_path}[/cyan] on port {self.port}...")
|
||||
try:
|
||||
cmd = ["python", "-m", "http.server", str(self.port)]
|
||||
creationflags = 0; preexec_fn = None
|
||||
if sys.platform == 'win32': creationflags = subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
self.process = subprocess.Popen(cmd, cwd=str(self.site_path), stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=creationflags)
|
||||
time.sleep(1.5)
|
||||
if self.is_running(): console.print(f"[bold green]HTTP server started successfully (PID: {self.process.pid})[/bold green]")
|
||||
else:
|
||||
console.print("[bold red]Failed to start HTTP server. Checking logs...[/bold red]")
|
||||
stdout, stderr = self.process.communicate(); print(stdout.decode(errors='ignore')); print(stderr.decode(errors='ignore'))
|
||||
self.stop(); raise RuntimeError("HTTP server failed to start.")
|
||||
except Exception as e: console.print(f"[bold red]Error starting HTTP server: {str(e)}[/bold red]"); self.stop(); raise
|
||||
|
||||
def stop(self) -> None:
|
||||
if self.process and self.is_running():
|
||||
console.print(f"Stopping HTTP server (PID: {self.process.pid})...")
|
||||
try:
|
||||
if sys.platform == 'win32': self.process.send_signal(signal.CTRL_BREAK_EVENT); time.sleep(0.5)
|
||||
self.process.terminate()
|
||||
try: stdout, stderr = self.process.communicate(timeout=5); console.print("[bold yellow]HTTP server stopped[/bold yellow]")
|
||||
except subprocess.TimeoutExpired: console.print("[bold red]Server did not terminate gracefully, killing...[/bold red]"); self.process.kill(); stdout, stderr = self.process.communicate(); console.print("[bold yellow]HTTP server killed[/bold yellow]")
|
||||
except Exception as e: console.print(f"[bold red]Error stopping HTTP server: {str(e)}[/bold red]"); self.process.kill()
|
||||
finally: self.process = None
|
||||
elif self.process: console.print("[dim]HTTP server process already stopped.[/dim]"); self.process = None
|
||||
|
||||
def is_running(self) -> bool:
|
||||
if not self.process: return False
|
||||
return self.process.poll() is None
|
||||
|
||||
# --- SimpleMemoryTracker Class (Unchanged) ---
|
||||
class SimpleMemoryTracker:
|
||||
"""Basic memory tracker that doesn't rely on psutil."""
|
||||
def __init__(self, report_path: str = DEFAULT_REPORT_PATH, test_id: Optional[str] = None):
|
||||
self.report_path = pathlib.Path(report_path); self.report_path.mkdir(parents=True, exist_ok=True)
|
||||
self.test_id = test_id or time.strftime("%Y%m%d_%H%M%S")
|
||||
self.start_time = time.time(); self.memory_samples = []; self.pid = os.getpid()
|
||||
self.csv_path = self.report_path / f"memory_samples_{self.test_id}.csv"
|
||||
with open(self.csv_path, 'w', encoding='utf-8') as f: f.write("timestamp,elapsed_seconds,memory_info_mb\n")
|
||||
|
||||
def sample(self) -> Dict:
|
||||
try:
|
||||
memory_mb = self._get_memory_info_mb()
|
||||
memory_str = f"{memory_mb:.1f} MB" if memory_mb is not None else "Unknown"
|
||||
timestamp = time.time(); elapsed = timestamp - self.start_time
|
||||
sample = {"timestamp": timestamp, "elapsed_seconds": elapsed, "memory_mb": memory_mb, "memory_str": memory_str}
|
||||
self.memory_samples.append(sample)
|
||||
with open(self.csv_path, 'a', encoding='utf-8') as f: f.write(f"{timestamp},{elapsed:.2f},{memory_mb if memory_mb is not None else ''}\n")
|
||||
return sample
|
||||
except Exception as e: return {"memory_mb": None, "memory_str": "Error"}
|
||||
|
||||
def _get_memory_info_mb(self) -> Optional[float]:
|
||||
pid_str = str(self.pid)
|
||||
try:
|
||||
if sys.platform == 'darwin': result = subprocess.run(["ps", "-o", "rss=", "-p", pid_str], capture_output=True, text=True, check=True, encoding='utf-8'); return int(result.stdout.strip()) / 1024.0
|
||||
elif sys.platform == 'linux':
|
||||
with open(f"/proc/{pid_str}/status", encoding='utf-8') as f:
|
||||
for line in f:
|
||||
if line.startswith("VmRSS:"): return int(line.split()[1]) / 1024.0
|
||||
return None
|
||||
elif sys.platform == 'win32': result = subprocess.run(["tasklist", "/fi", f"PID eq {pid_str}", "/fo", "csv", "/nh"], capture_output=True, text=True, check=True, encoding='cp850', errors='ignore'); parts = result.stdout.strip().split('","'); return int(parts[4].strip().replace('"', '').replace(' K', '').replace(',', '')) / 1024.0 if len(parts) >= 5 else None
|
||||
else: return None
|
||||
except Exception: return None # Catch all exceptions for robustness
|
||||
|
||||
def get_report(self) -> Dict:
|
||||
if not self.memory_samples: return {"error": "No memory samples collected"}
|
||||
total_time = time.time() - self.start_time; valid_samples = [s['memory_mb'] for s in self.memory_samples if s['memory_mb'] is not None]
|
||||
start_mem = valid_samples[0] if valid_samples else None; end_mem = valid_samples[-1] if valid_samples else None
|
||||
max_mem = max(valid_samples) if valid_samples else None; avg_mem = sum(valid_samples) / len(valid_samples) if valid_samples else None
|
||||
growth = (end_mem - start_mem) if start_mem is not None and end_mem is not None else None
|
||||
return {"test_id": self.test_id, "total_time_seconds": total_time, "sample_count": len(self.memory_samples), "valid_sample_count": len(valid_samples), "csv_path": str(self.csv_path), "platform": sys.platform, "start_memory_mb": start_mem, "end_memory_mb": end_mem, "max_memory_mb": max_mem, "average_memory_mb": avg_mem, "memory_growth_mb": growth}
|
||||
|
||||
|
||||
# --- CrawlerStressTest Class (Refactored for Per-Batch Logging) ---
|
||||
class CrawlerStressTest:
|
||||
"""Orchestrates the stress test using arun_many per chunk and a dispatcher."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url_count: int = DEFAULT_URL_COUNT,
|
||||
port: int = DEFAULT_PORT,
|
||||
max_sessions: int = DEFAULT_MAX_SESSIONS,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE, # Added chunk_size
|
||||
report_path: str = DEFAULT_REPORT_PATH,
|
||||
stream_mode: bool = DEFAULT_STREAM_MODE,
|
||||
monitor_mode: str = DEFAULT_MONITOR_MODE,
|
||||
use_rate_limiter: bool = False
|
||||
):
|
||||
self.url_count = url_count
|
||||
self.server_port = port
|
||||
self.max_sessions = max_sessions
|
||||
self.chunk_size = chunk_size # Store chunk size
|
||||
self.report_path = pathlib.Path(report_path)
|
||||
self.report_path.mkdir(parents=True, exist_ok=True)
|
||||
self.stream_mode = stream_mode
|
||||
self.monitor_mode = DisplayMode[monitor_mode.upper()]
|
||||
self.use_rate_limiter = use_rate_limiter
|
||||
|
||||
self.test_id = time.strftime("%Y%m%d_%H%M%S")
|
||||
self.results_summary = {
|
||||
"test_id": self.test_id, "url_count": url_count, "max_sessions": max_sessions,
|
||||
"chunk_size": chunk_size, "stream_mode": stream_mode, "monitor_mode": monitor_mode,
|
||||
"rate_limiter_used": use_rate_limiter, "start_time": "", "end_time": "",
|
||||
"total_time_seconds": 0, "successful_urls": 0, "failed_urls": 0,
|
||||
"urls_processed": 0, "chunks_processed": 0
|
||||
}
|
||||
|
||||
async def run(self) -> Dict:
|
||||
"""Run the stress test and return results."""
|
||||
memory_tracker = SimpleMemoryTracker(report_path=self.report_path, test_id=self.test_id)
|
||||
urls = [f"http://localhost:{self.server_port}/page_{i}.html" for i in range(self.url_count)]
|
||||
# Split URLs into chunks based on self.chunk_size
|
||||
url_chunks = [urls[i:i+self.chunk_size] for i in range(0, len(urls), self.chunk_size)]
|
||||
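# e.g. 100 URLs with chunk_size=10 yields 10 chunks, i.e. 10 separate arun_many() calls below.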
|
||||
self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
start_time = time.time()
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
wait_for_images=False, verbose=False,
|
||||
stream=self.stream_mode, # Still pass stream mode, affects arun_many return type
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
total_successful_urls = 0
|
||||
total_failed_urls = 0
|
||||
total_urls_processed = 0
|
||||
start_memory_sample = memory_tracker.sample()
|
||||
start_memory_str = start_memory_sample.get("memory_str", "Unknown")
|
||||
|
||||
# monitor = CrawlerMonitor(display_mode=self.monitor_mode, total_urls=self.url_count)
|
||||
monitor = None
|
||||
rate_limiter = RateLimiter(base_delay=(0.1, 0.3)) if self.use_rate_limiter else None
|
||||
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=self.max_sessions, monitor=monitor, rate_limiter=rate_limiter)
|
||||
|
||||
console.print(f"\n[bold cyan]Crawl4AI Stress Test - {self.url_count} URLs, {self.max_sessions} max sessions[/bold cyan]")
|
||||
console.print(f"[bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]Monitor:[/bold cyan] {self.monitor_mode.name}, [bold cyan]Chunk Size:[/bold cyan] {self.chunk_size}")
|
||||
console.print(f"[bold cyan]Initial Memory:[/bold cyan] {start_memory_str}")
|
||||
|
||||
# Print batch log header only if not streaming
|
||||
if not self.stream_mode:
|
||||
console.print("\n[bold]Batch Progress:[/bold] (Monitor below shows overall progress)")
|
||||
console.print("[bold] Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status [/bold]")
|
||||
console.print("─" * 90)
|
||||
|
||||
monitor_task = asyncio.create_task(self._periodic_memory_sample(memory_tracker, 2.0))
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(
|
||||
config=BrowserConfig( verbose = False)
|
||||
) as crawler:
|
||||
# Process URLs chunk by chunk
|
||||
for chunk_idx, url_chunk in enumerate(url_chunks):
|
||||
batch_start_time = time.time()
|
||||
chunk_success = 0
|
||||
chunk_failed = 0
|
||||
|
||||
# Sample memory before the chunk
|
||||
start_mem_sample = memory_tracker.sample()
|
||||
start_mem_str = start_mem_sample.get("memory_str", "Unknown")
|
||||
|
||||
# --- Call arun_many for the current chunk ---
|
||||
try:
|
||||
# Note: dispatcher/monitor persist across calls
|
||||
results_gen_or_list: Union[AsyncGenerator[CrawlResult, None], List[CrawlResult]] = \
|
||||
await crawler.arun_many(
|
||||
urls=url_chunk,
|
||||
config=config,
|
||||
dispatcher=dispatcher # Reuse the same dispatcher
|
||||
)
|
||||
|
||||
if self.stream_mode:
|
||||
# Process stream results if needed, but batch logging is less relevant
|
||||
async for result in results_gen_or_list:
|
||||
total_urls_processed += 1
|
||||
if result.success: chunk_success += 1
|
||||
else: chunk_failed += 1
|
||||
# In stream mode, batch summary isn't as meaningful here
|
||||
# We could potentially track completion per chunk async, but it's complex
|
||||
|
||||
else: # Batch mode
|
||||
# Process the list of results for this chunk
|
||||
for result in results_gen_or_list:
|
||||
total_urls_processed += 1
|
||||
if result.success: chunk_success += 1
|
||||
else: chunk_failed += 1
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]Error processing chunk {chunk_idx+1}: {e}[/bold red]")
|
||||
chunk_failed = len(url_chunk) # Assume all failed in the chunk on error
|
||||
total_urls_processed += len(url_chunk) # Count them as processed (failed)
|
||||
|
||||
# --- Log batch results (only if not streaming) ---
|
||||
if not self.stream_mode:
|
||||
batch_time = time.time() - batch_start_time
|
||||
urls_per_sec = len(url_chunk) / batch_time if batch_time > 0 else 0
|
||||
end_mem_sample = memory_tracker.sample()
|
||||
end_mem_str = end_mem_sample.get("memory_str", "Unknown")
|
||||
|
||||
progress_pct = (total_urls_processed / self.url_count) * 100
|
||||
|
||||
if chunk_failed == 0: status_color, status = "green", "Success"
|
||||
elif chunk_success == 0: status_color, status = "red", "Failed"
|
||||
else: status_color, status = "yellow", "Partial"
|
||||
|
||||
console.print(
|
||||
f" {chunk_idx+1:<5} | {progress_pct:6.1f}% | {start_mem_str:>9} | {end_mem_str:>9} | {urls_per_sec:8.1f} | "
|
||||
f"{chunk_success:^7}/{chunk_failed:<6} | {batch_time:8.2f} | [{status_color}]{status:<7}[/{status_color}]"
|
||||
)
|
||||
|
||||
# Accumulate totals
|
||||
total_successful_urls += chunk_success
|
||||
total_failed_urls += chunk_failed
|
||||
self.results_summary["chunks_processed"] += 1
|
||||
|
||||
# Optional small delay between starting chunks if needed
|
||||
# await asyncio.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[bold red]An error occurred during the main crawl loop: {e}[/bold red]")
|
||||
finally:
|
||||
if 'monitor_task' in locals() and not monitor_task.done():
|
||||
monitor_task.cancel()
|
||||
try: await monitor_task
|
||||
except asyncio.CancelledError: pass
|
||||
|
||||
end_time = time.time()
|
||||
self.results_summary.update({
|
||||
"end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"total_time_seconds": end_time - start_time,
|
||||
"successful_urls": total_successful_urls,
|
||||
"failed_urls": total_failed_urls,
|
||||
"urls_processed": total_urls_processed,
|
||||
"memory": memory_tracker.get_report()
|
||||
})
|
||||
self._save_results()
|
||||
return self.results_summary
|
||||
|
||||
async def _periodic_memory_sample(self, tracker: SimpleMemoryTracker, interval: float):
|
||||
"""Background task to sample memory periodically."""
|
||||
while True:
|
||||
tracker.sample()
|
||||
try:
|
||||
await asyncio.sleep(interval)
|
||||
except asyncio.CancelledError:
|
||||
break # Exit loop on cancellation
|
||||
|
||||
def _save_results(self) -> None:
|
||||
results_path = self.report_path / f"test_summary_{self.test_id}.json"
|
||||
try:
|
||||
with open(results_path, 'w', encoding='utf-8') as f: json.dump(self.results_summary, f, indent=2, default=str)
|
||||
# console.print(f"\n[bold green]Results summary saved to {results_path}[/bold green]") # Moved summary print to run_full_test
|
||||
except Exception as e: console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
|
||||
|
||||
|
||||
# --- run_full_test Function (Adjusted) ---
async def run_full_test(args):
    """Run the complete test process from site generation to crawling."""
    server = None
    site_generated = False

    # --- Site Generation --- (Same as before)
    if not args.use_existing_site and not args.skip_generation:
        if os.path.exists(args.site_path): console.print(f"[yellow]Removing existing site directory: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
        site_generator = SiteGenerator(site_path=args.site_path, page_count=args.urls); site_generator.generate_site(); site_generated = True
    elif args.use_existing_site: console.print(f"[cyan]Using existing site assumed to be running on port {args.port}[/cyan]")
    elif args.skip_generation:
        console.print(f"[cyan]Skipping site generation, using existing directory: {args.site_path}[/cyan]")
        if not os.path.exists(args.site_path) or not os.path.isdir(args.site_path): console.print(f"[bold red]Error: Site path '{args.site_path}' does not exist or is not a directory.[/bold red]"); return

    # --- Start Local Server --- (Same as before)
    server_started = False
    if not args.use_existing_site:
        server = LocalHttpServer(site_path=args.site_path, port=args.port)
        try: server.start(); server_started = True
        except Exception as e:
            console.print(f"[bold red]Failed to start local server. Aborting test.[/bold red]")
            if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
            return

    try:
        # --- Run the Stress Test ---
        test = CrawlerStressTest(
            url_count=args.urls,
            port=args.port,
            max_sessions=args.max_sessions,
            chunk_size=args.chunk_size, # Pass chunk_size
            report_path=args.report_path,
            stream_mode=args.stream,
            monitor_mode=args.monitor_mode,
            use_rate_limiter=args.use_rate_limiter
        )
        results = await test.run() # Run the test which now handles chunks internally

        # --- Print Summary ---
        console.print("\n" + "=" * 80)
        console.print("[bold green]Test Completed[/bold green]")
        console.print("=" * 80)

        # (Summary printing logic remains largely the same)
        success_rate = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
        urls_per_second = results["urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0

        console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
        console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_sessions']} sessions, Chunk: {results['chunk_size']}, Stream: {results['stream_mode']}, Monitor: {results['monitor_mode']}")
        console.print(f"[bold cyan]Results:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['urls_processed']} processed, {success_rate:.1f}% success)")
        console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f} seconds total, {urls_per_second:.2f} URLs/second avg")

        mem_report = results.get("memory", {})
        mem_info_str = "Memory tracking data unavailable."
        if mem_report and not mem_report.get("error"):
            start_mb = mem_report.get('start_memory_mb'); end_mb = mem_report.get('end_memory_mb'); max_mb = mem_report.get('max_memory_mb'); growth_mb = mem_report.get('memory_growth_mb')
            mem_parts = []
            if start_mb is not None: mem_parts.append(f"Start: {start_mb:.1f} MB")
            if end_mb is not None: mem_parts.append(f"End: {end_mb:.1f} MB")
            if max_mb is not None: mem_parts.append(f"Max: {max_mb:.1f} MB")
            if growth_mb is not None: mem_parts.append(f"Growth: {growth_mb:.1f} MB")
            if mem_parts: mem_info_str = ", ".join(mem_parts)
            csv_path = mem_report.get('csv_path')
            if csv_path: console.print(f"[dim]Memory samples saved to: {csv_path}[/dim]")

        console.print(f"[bold cyan]Memory Usage:[/bold cyan] {mem_info_str}")
        console.print(f"[bold green]Results summary saved to {results['memory']['csv_path'].replace('memory_samples', 'test_summary').replace('.csv', '.json')}[/bold green]") # Infer summary path

        if results["failed_urls"] > 0: console.print(f"\n[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate:.1f}% failure rate)[/bold yellow]")
        if results["urls_processed"] < results["url_count"]: console.print(f"\n[bold red]Error: Only {results['urls_processed']} out of {results['url_count']} URLs were processed![/bold red]")

    finally:
        # --- Stop Server / Cleanup --- (Same as before)
        if server_started and server and not args.keep_server_alive: server.stop()
        elif server_started and server and args.keep_server_alive:
            console.print(f"[bold cyan]Server is kept running on port {args.port}. Press Ctrl+C to stop it.[/bold cyan]")
            try: await asyncio.Future() # Keep running indefinitely
            except KeyboardInterrupt: console.print("\n[bold yellow]Stopping server due to user interrupt...[/bold yellow]"); server.stop()

        if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
        elif args.clean_site and os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)


# --- main Function (Added chunk_size argument) ---
def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(description="Crawl4AI SDK High Volume Stress Test using arun_many")

    # Test parameters
    parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Number of URLs to test (default: {DEFAULT_URL_COUNT})")
    parser.add_argument("--max-sessions", type=int, default=DEFAULT_MAX_SESSIONS, help=f"Maximum concurrent crawling sessions (default: {DEFAULT_MAX_SESSIONS})")
    parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per batch for logging (default: {DEFAULT_CHUNK_SIZE})") # Added
    parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Enable streaming mode (disables batch logging) (default: {DEFAULT_STREAM_MODE})")
    parser.add_argument("--monitor-mode", type=str, default=DEFAULT_MONITOR_MODE, choices=["DETAILED", "AGGREGATED"], help=f"Display mode for the live monitor (default: {DEFAULT_MONITOR_MODE})")
    parser.add_argument("--use-rate-limiter", action="store_true", default=False, help="Enable a basic rate limiter (default: False)")

    # Environment parameters
    parser.add_argument("--site-path", type=str, default=DEFAULT_SITE_PATH, help=f"Path to generate/use the test site (default: {DEFAULT_SITE_PATH})")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port for the local HTTP server (default: {DEFAULT_PORT})")
    parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")

    # Site/Server management
    parser.add_argument("--skip-generation", action="store_true", help="Use existing test site folder without regenerating")
    parser.add_argument("--use-existing-site", action="store_true", help="Do not generate site or start local server; assume site exists on --port")
    parser.add_argument("--keep-server-alive", action="store_true", help="Keep the local HTTP server running after test")
    parser.add_argument("--keep-site", action="store_true", help="Keep the generated test site files after test")
    parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
    parser.add_argument("--clean-site", action="store_true", help="Clean up site directory before running (if generating) or after")

    args = parser.parse_args()

    # Display config
    console.print("[bold underline]Crawl4AI SDK Stress Test Configuration[/bold underline]")
    console.print(f"URLs: {args.urls}, Max Sessions: {args.max_sessions}, Chunk Size: {args.chunk_size}") # Added chunk size
    console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}, Monitor: {args.monitor_mode}, Rate Limit: {args.use_rate_limiter}")
    console.print(f"Site Path: {args.site_path}, Port: {args.port}, Report Path: {args.report_path}")
    console.print("-" * 40)
    # (Rest of config display and cleanup logic is the same)
    if args.use_existing_site: console.print("[cyan]Mode: Using existing external site/server[/cyan]")
    elif args.skip_generation: console.print("[cyan]Mode: Using existing site files, starting local server[/cyan]")
    else: console.print("[cyan]Mode: Generating site files, starting local server[/cyan]")
    if args.keep_server_alive: console.print("[cyan]Option: Keep server alive after test[/cyan]")
    if args.keep_site: console.print("[cyan]Option: Keep site files after test[/cyan]")
    if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
    if args.clean_site: console.print("[cyan]Option: Clean site directory[/cyan]")
    console.print("-" * 40)

    if args.clean_reports:
        if os.path.exists(args.report_path): console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]"); shutil.rmtree(args.report_path)
        os.makedirs(args.report_path, exist_ok=True)
    if args.clean_site and not args.use_existing_site:
        if os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)

    # Run
    try: asyncio.run(run_full_test(args))
    except KeyboardInterrupt: console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
    except Exception as e: console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}"); import traceback; traceback.print_exc()


if __name__ == "__main__":
    main()
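For context, run_full_test(args) only needs an args namespace carrying the flags registered above, so the same flow can be driven without the CLI. A minimal sketch, assuming this module is imported as-is; all values below are illustrative placeholders, not taken from the script:

import asyncio
from types import SimpleNamespace

# Mirror of the argparse flags defined in main(); values are placeholders.
args = SimpleNamespace(
    urls=2000, max_sessions=20, chunk_size=100,
    stream=False, monitor_mode="AGGREGATED", use_rate_limiter=False,
    site_path="./stress_test_site", port=8000, report_path="./stress_reports",
    skip_generation=False, use_existing_site=False,
    keep_server_alive=False, keep_site=False,
    clean_reports=False, clean_site=False,
)
asyncio.run(run_full_test(args))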
32
tests/profiler/test_crteate_profile.py
Normal file
@@ -0,0 +1,32 @@
from crawl4ai import BrowserProfiler
import asyncio


if __name__ == "__main__":
    # Example usage
    profiler = BrowserProfiler()

    # Create a new profile
    import os
    from pathlib import Path
    home_dir = Path.home()
    profile_path = asyncio.run(profiler.create_profile(str(home_dir / ".crawl4ai/profiles/test-profile")))

    print(f"Profile created at: {profile_path}")


    # # Launch a standalone browser
    # asyncio.run(profiler.launch_standalone_browser())

    # # List profiles
    # profiles = profiler.list_profiles()
    # for profile in profiles:
    #     print(f"Profile: {profile['name']}, Path: {profile['path']}")

    # # Delete a profile
    # success = profiler.delete_profile("my-profile")
    # if success:
    #     print("Profile deleted successfully")
    # else:
    #     print("Failed to delete profile")
@@ -19,7 +19,7 @@ async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(f"Success: {result.success}")
        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
        print(f"Markdown length: {len(result.markdown.raw_markdown)}")


if __name__ == "__main__":
@@ -1,4 +1,5 @@
import unittest, os
from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import (
    RegexChunking,
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
                llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
            ),
            bypass_cache=True,
        )
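The hunk above replaces the separate provider/api_token keywords with a single llm_config argument. As a minimal standalone sketch of the updated construction; the bare call and imports are assumptions for illustration, while the provider name, environment variable, and LLMConfig usage come from the hunk itself:

import os
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Post-change style: the provider and credentials travel inside LLMConfig.
strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
)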