refactor(server): migrate to pool-based crawler management

Replace crawler_manager.py with simpler crawler_pool.py implementation:
- Add global page semaphore for hard concurrency cap
- Implement browser pool with idle cleanup
- Add playground UI for testing and stress testing
- Update API handlers to use pooled crawlers
- Enhance logging levels and symbols

BREAKING CHANGE: Removes CrawlerManager class in favor of simpler pool-based approach
This commit is contained in:
UncleCode
2025-04-20 20:14:26 +08:00
parent 16b2318242
commit a58c8000aa
14 changed files with 1447 additions and 1435 deletions

View File

@@ -162,6 +162,9 @@ RUN crawl4ai-doctor
# Copy application code
COPY deploy/docker/* ${APP_HOME}/
# copy the playground + any future static assets
COPY deploy/docker/static ${APP_HOME}/static
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}

View File

@@ -7,11 +7,18 @@ from datetime import datetime
class LogLevel(Enum):
    """Severity levels understood by AsyncLogger, least to most severe by value."""
    DEFAULT = 0
    DEBUG = 1
    INFO = 2
    SUCCESS = 3
    WARNING = 4
    ERROR = 5
    CRITICAL = 6
    ALERT = 7
    # NOTE(review): NOTICE sits numerically above CRITICAL/ALERT although it is
    # semantically milder — confirm the ordering is intentional if any
    # threshold filtering compares level values.
    NOTICE = 8
    EXCEPTION = 9
    FATAL = 10
@@ -61,6 +68,13 @@ class AsyncLogger(AsyncLoggerBase):
"DEBUG": "",
"INFO": "",
"WARNING": "",
"SUCCESS": "",
"CRITICAL": "",
"ALERT": "",
"NOTICE": "",
"EXCEPTION": "",
"FATAL": "",
"DEFAULT": "",
}
DEFAULT_COLORS = {
@@ -69,6 +83,12 @@ class AsyncLogger(AsyncLoggerBase):
LogLevel.SUCCESS: Fore.GREEN,
LogLevel.WARNING: Fore.YELLOW,
LogLevel.ERROR: Fore.RED,
LogLevel.CRITICAL: Fore.RED + Style.BRIGHT,
LogLevel.ALERT: Fore.RED + Style.BRIGHT,
LogLevel.NOTICE: Fore.BLUE,
LogLevel.EXCEPTION: Fore.RED + Style.BRIGHT,
LogLevel.FATAL: Fore.RED + Style.BRIGHT,
LogLevel.DEFAULT: Fore.WHITE,
}
def __init__(
@@ -212,6 +232,22 @@ class AsyncLogger(AsyncLoggerBase):
def warning(self, message: str, tag: str = "WARNING", **kwargs):
"""Log a warning message."""
self._log(LogLevel.WARNING, message, tag, **kwargs)
def critical(self, message: str, tag: str = "CRITICAL", **kwargs):
    """Log a critical message.

    Fix: dispatch at LogLevel.CRITICAL — previously routed through
    LogLevel.ERROR, so the dedicated CRITICAL color/symbol added to
    DEFAULT_COLORS were never used.
    """
    self._log(LogLevel.CRITICAL, message, tag, **kwargs)
def exception(self, message: str, tag: str = "EXCEPTION", **kwargs):
    """Log an exception message.

    Fix: dispatch at LogLevel.EXCEPTION instead of LogLevel.ERROR so the
    EXCEPTION color/symbol mappings take effect.
    """
    self._log(LogLevel.EXCEPTION, message, tag, **kwargs)
def fatal(self, message: str, tag: str = "FATAL", **kwargs):
    """Log a fatal message.

    Fix: dispatch at LogLevel.FATAL instead of LogLevel.ERROR so the
    FATAL color/symbol mappings take effect.
    """
    self._log(LogLevel.FATAL, message, tag, **kwargs)
def alert(self, message: str, tag: str = "ALERT", **kwargs):
    """Log an alert message.

    Fix: dispatch at LogLevel.ALERT instead of LogLevel.ERROR so the
    ALERT color/symbol mappings take effect.
    """
    self._log(LogLevel.ALERT, message, tag, **kwargs)
def notice(self, message: str, tag: str = "NOTICE", **kwargs):
    """Log a notice message.

    Fix: dispatch at LogLevel.NOTICE instead of LogLevel.INFO so the
    NOTICE color/symbol mappings take effect.
    """
    self._log(LogLevel.NOTICE, message, tag, **kwargs)
def error(self, message: str, tag: str = "ERROR", **kwargs):
"""Log an error message."""

View File

@@ -572,6 +572,9 @@ class BrowserManager:
if self.config.extra_args:
args.extend(self.config.extra_args)
# Deduplicate args
args = list(dict.fromkeys(args))
browser_args = {"headless": self.config.headless, "args": args}
if self.config.chrome_channel:

View File

@@ -1,503 +0,0 @@
import os
import json
import asyncio
from typing import List, Tuple
from functools import partial
import logging
from typing import Optional, AsyncGenerator
from urllib.parse import unquote
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from redis import asyncio as aioredis
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
LLMExtractionStrategy,
CacheMode,
BrowserConfig,
MemoryAdaptiveDispatcher,
RateLimiter,
LLMConfig
)
from crawl4ai.utils import perform_completion_with_backoff
from crawl4ai.content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter
)
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from utils import (
TaskStatus,
FilterType,
get_base_url,
is_task_id,
should_cleanup_task,
decode_redis_hash
)
import psutil, time
logger = logging.getLogger(__name__)
# --- Helper to get memory ---
def _get_memory_mb():
    """Return this process's resident set size in MiB, or None when unreadable."""
    try:
        rss_bytes = psutil.Process().memory_info().rss
    except Exception as probe_err:
        # Best-effort telemetry only: never let a metrics probe break a request.
        logger.warning(f"Could not get memory info: {probe_err}")
        return None
    return rss_bytes / (1024 * 1024)
async def handle_llm_qa(
    url: str,
    query: str,
    config: dict
) -> str:
    """Process QA using LLM with crawled content as context.

    Crawls *url*, uses its filtered (fit) markdown as context, and asks the
    provider configured under config["llm"] to answer *query*.

    Raises:
        HTTPException: 500 on crawl failure or any other processing error.
    """
    try:
        # Extract base URL by finding last '?q=' occurrence — the /llm route
        # appends the question to the URL, so strip it before crawling.
        last_q_index = url.rfind('?q=')
        if last_q_index != -1:
            url = url[:last_q_index]
        # Get markdown content
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url)
            if not result.success:
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=result.error_message
                )
            content = result.markdown.fit_markdown
        # Create prompt and get LLM response
        prompt = f"""Use the following content as context to answer the question.
Content:
{content}
Question: {query}
Answer:"""
        response = perform_completion_with_backoff(
            provider=config["llm"]["provider"],
            prompt_with_variables=prompt,
            api_token=os.environ.get(config["llm"].get("api_key_env", ""))
        )
        return response.choices[0].message.content
    except Exception as e:
        # NOTE: the HTTPException raised above is also caught here and
        # re-wrapped as a generic 500 with the stringified error as detail.
        logger.error(f"QA processing error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
async def process_llm_extraction(
    redis: aioredis.Redis,
    config: dict,
    task_id: str,
    url: str,
    instruction: str,
    schema: Optional[str] = None,
    cache: str = "0"
) -> None:
    """Process LLM extraction in background.

    Crawls *url* with an LLMExtractionStrategy and records the outcome in the
    Redis hash ``task:{task_id}``: COMPLETED with a JSON-encoded result, or
    FAILED with an error message. Never raises — all failures are written to
    Redis so the polling endpoint can report them.
    """
    try:
        # If config['llm'] has api_key then ignore the api_key_env
        api_key = ""
        if "api_key" in config["llm"]:
            api_key = config["llm"]["api_key"]
        else:
            api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
        llm_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=config["llm"]["provider"],
                api_token=api_key
            ),
            instruction=instruction,
            # schema arrives as a JSON string from the HTTP layer.
            schema=json.loads(schema) if schema else None,
        )
        # cache == "1" reuses cached pages; otherwise crawl fresh but still populate the cache.
        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    extraction_strategy=llm_strategy,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )
        if not result.success:
            await redis.hset(f"task:{task_id}", mapping={
                "status": TaskStatus.FAILED,
                "error": result.error_message
            })
            return
        # Extracted content may already be JSON; fall back to the raw string.
        try:
            content = json.loads(result.extracted_content)
        except json.JSONDecodeError:
            content = result.extracted_content
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.COMPLETED,
            "result": json.dumps(content)
        })
    except Exception as e:
        logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.FAILED,
            "error": str(e)
        })
async def handle_markdown_request(
    url: str,
    filter_type: FilterType,
    query: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> str:
    """Handle markdown generation requests.

    Crawls *url* and returns its markdown. *filter_type* selects the content
    filter: RAW (no filtering), FIT (pruning), BM25 (query-ranked), or LLM
    (instruction-driven; requires config["llm"]).

    Raises:
        HTTPException: 500 on crawl failure or any processing error.

    Fix: filters are now constructed lazily. The previous dict-literal
    dispatch built every filter eagerly, so a FIT/BM25 request crashed with
    TypeError when config was None and always paid LLMContentFilter
    construction cost even when unused.
    """
    try:
        decoded_url = unquote(url)
        # Default to https when the caller omitted the scheme.
        if not decoded_url.startswith(('http://', 'https://')):
            decoded_url = 'https://' + decoded_url
        if filter_type == FilterType.RAW:
            md_generator = DefaultMarkdownGenerator()
        else:
            # Build only the filter that was actually requested.
            if filter_type == FilterType.FIT:
                content_filter = PruningContentFilter()
            elif filter_type == FilterType.BM25:
                content_filter = BM25ContentFilter(user_query=query or "")
            else:  # FilterType.LLM
                content_filter = LLMContentFilter(
                    llm_config=LLMConfig(
                        provider=config["llm"]["provider"],
                        api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
                    ),
                    instruction=query or "Extract main content"
                )
            md_generator = DefaultMarkdownGenerator(content_filter=content_filter)
        # cache == "1" reuses cached pages; otherwise crawl fresh but still populate the cache.
        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=decoded_url,
                config=CrawlerRunConfig(
                    markdown_generator=md_generator,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )
        if not result.success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=result.error_message
            )
        return (result.markdown.raw_markdown
                if filter_type == FilterType.RAW
                else result.markdown.fit_markdown)
    except Exception as e:
        logger.error(f"Markdown error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
async def handle_llm_request(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    request: Request,
    input_path: str,
    query: Optional[str] = None,
    schema: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> JSONResponse:
    """Handle LLM extraction requests.

    Dispatches on *input_path*: a task id means the client is polling an
    existing job; otherwise a new extraction task is created (or a usage
    hint is returned when no instruction was supplied).
    """
    base_url = get_base_url(request)
    try:
        # Polling an existing job?
        if is_task_id(input_path):
            return await handle_task_status(redis, input_path, base_url)
        # No instruction supplied: answer with a usage hint instead of queuing work.
        if not query:
            hint = {
                "message": "Please provide an instruction",
                "_links": {
                    "example": {
                        "href": f"{base_url}/llm/{input_path}?q=Extract+main+content",
                        "title": "Try this example"
                    }
                }
            }
            return JSONResponse(hint)
        # Queue a fresh extraction task.
        return await create_new_task(
            redis, background_tasks, input_path, query,
            schema, cache, base_url, config
        )
    except Exception as e:
        logger.error(f"LLM endpoint error: {str(e)}", exc_info=True)
        failure = {
            "error": str(e),
            "_links": {
                "retry": {"href": str(request.url)}
            }
        }
        return JSONResponse(failure, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
async def handle_task_status(
    redis: aioredis.Redis,
    task_id: str,
    base_url: str
) -> JSONResponse:
    """Handle task status check requests.

    Looks up ``task:{task_id}`` in Redis and returns its status payload.
    Terminal tasks (COMPLETED/FAILED) past the cleanup window are deleted
    after the response is built, so the final poll still sees the result.

    Raises:
        HTTPException: 404 when the task id is unknown (or already cleaned up).
    """
    task = await redis.hgetall(f"task:{task_id}")
    if not task:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )
    # Redis returns raw bytes; normalize to str keys/values first.
    task = decode_redis_hash(task)
    response = create_task_response(task, task_id, base_url)
    if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
        # Opportunistic cleanup of finished tasks past their TTL.
        if should_cleanup_task(task["created_at"]):
            await redis.delete(f"task:{task_id}")
    return JSONResponse(response)
async def create_new_task(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    input_path: str,
    query: str,
    schema: Optional[str],
    cache: str,
    base_url: str,
    config: dict
) -> JSONResponse:
    """Create and initialize a new task.

    Writes a PROCESSING entry to Redis, schedules process_llm_extraction as a
    FastAPI background task, and returns the task id with polling links.
    """
    decoded_url = unquote(input_path)
    # Default to https when the caller omitted the scheme.
    if not decoded_url.startswith(('http://', 'https://')):
        decoded_url = 'https://' + decoded_url
    from datetime import datetime
    # NOTE(review): second-resolution timestamp + id(background_tasks) is only
    # weakly unique; consider uuid4 if task-id collisions are ever observed.
    task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
    await redis.hset(f"task:{task_id}", mapping={
        "status": TaskStatus.PROCESSING,
        "created_at": datetime.now().isoformat(),
        "url": decoded_url
    })
    background_tasks.add_task(
        process_llm_extraction,
        redis,
        config,
        task_id,
        decoded_url,
        query,
        schema,
        cache
    )
    return JSONResponse({
        "task_id": task_id,
        "status": TaskStatus.PROCESSING,
        "url": decoded_url,
        "_links": {
            "self": {"href": f"{base_url}/llm/{task_id}"},
            "status": {"href": f"{base_url}/llm/{task_id}"}
        }
    })
def create_task_response(task: dict, task_id: str, base_url: str) -> dict:
    """Build the status payload for a task poll.

    Always includes id/status/created_at/url plus HATEOAS links; a COMPLETED
    task additionally carries its decoded result, a FAILED task its error.
    """
    poll_href = f"{base_url}/llm/{task_id}"
    payload = {
        "task_id": task_id,
        "status": task["status"],
        "created_at": task["created_at"],
        "url": task["url"],
        "_links": {
            "self": {"href": poll_href},
            "refresh": {"href": poll_href}
        }
    }
    state = task["status"]
    if state == TaskStatus.COMPLETED:
        payload["result"] = json.loads(task["result"])
    elif state == TaskStatus.FAILED:
        payload["error"] = task["error"]
    return payload
async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream crawl results as NDJSON with heartbeats and a completion marker.

    Each result is serialized to one newline-terminated JSON line (with the
    server's current memory usage attached); per-result serialization errors
    yield an error line instead of aborting the stream. A final
    {"status": "completed"} line marks normal end-of-stream. The crawler is
    always closed when the stream finishes or the client disconnects.
    """
    import json
    from utils import datetime_handler
    try:
        async for result in results_gen:
            try:
                server_memory_mb = _get_memory_mb()
                result_dict = result.model_dump()
                result_dict['server_memory_mb'] = server_memory_mb
                logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
                data = json.dumps(result_dict, default=datetime_handler) + "\n"
                yield data.encode('utf-8')
            except Exception as e:
                logger.error(f"Serialization error: {e}")
                error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')}
                yield (json.dumps(error_response) + "\n").encode('utf-8')
        # Fix: terminate the completion marker with "\n" like every other
        # line, so line-delimited (NDJSON) clients can parse it without
        # waiting for EOF.
        yield (json.dumps({"status": "completed"}) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        logger.warning("Client disconnected during streaming")
    finally:
        try:
            await crawler.close()
        except Exception as e:
            logger.error(f"Crawler cleanup error: {e}")
async def handle_crawl_request(
    urls: List[str],
    browser_config: dict,
    crawler_config: dict,
    config: dict
) -> dict:
    """Handle non-streaming crawl requests.

    Starts a dedicated AsyncWebCrawler, crawls one URL (arun) or many
    (arun_many) under a memory-adaptive dispatcher, then closes the crawler
    and returns the serialized results plus timing/memory telemetry.

    Raises:
        HTTPException: 500 whose detail is a JSON string carrying the error
            and the memory stats measured at failure time.
    """
    start_mem_mb = _get_memory_mb()  # <--- Get memory before
    start_time = time.time()
    mem_delta_mb = None
    # NOTE(review): "peak" is only sampled at start and end of the request,
    # not continuously — it can under-report the true high-water mark.
    peak_mem_mb = start_mem_mb
    try:
        browser_config = BrowserConfig.load(browser_config)
        crawler_config = CrawlerRunConfig.load(crawler_config)
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
            rate_limiter=RateLimiter(
                base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
            )
        )
        crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
        await crawler.start()
        results = []
        # Single URL -> arun; multiple URLs -> arun_many with the dispatcher.
        func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
        partial_func = partial(func,
                               urls[0] if len(urls) == 1 else urls,
                               config=crawler_config,
                               dispatcher=dispatcher)
        results = await partial_func()
        await crawler.close()
        end_mem_mb = _get_memory_mb()  # <--- Get memory after
        end_time = time.time()
        if start_mem_mb is not None and end_mem_mb is not None:
            mem_delta_mb = end_mem_mb - start_mem_mb  # <--- Calculate delta
            peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb)  # <--- Get peak memory
        logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
        return {
            "success": True,
            "results": [result.model_dump() for result in results],
            "server_processing_time_s": end_time - start_time,
            "server_memory_delta_mb": mem_delta_mb,
            "server_peak_memory_mb": peak_mem_mb
        }
    except Exception as e:
        logger.error(f"Crawl error: {str(e)}", exc_info=True)
        if 'crawler' in locals() and crawler.ready:  # Check if crawler was initialized and started
            try:
                await crawler.close()
            except Exception as close_e:
                logger.error(f"Error closing crawler during exception handling: {close_e}")
        # Measure memory even on error if possible
        end_mem_mb_error = _get_memory_mb()
        if start_mem_mb is not None and end_mem_mb_error is not None:
            mem_delta_mb = end_mem_mb_error - start_mem_mb
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=json.dumps({  # Send structured error
                "error": str(e),
                "server_memory_delta_mb": mem_delta_mb,
                "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0)
            })
        )
async def handle_stream_crawl_request(
    urls: List[str],
    browser_config: dict,
    crawler_config: dict,
    config: dict
) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
    """Handle streaming crawl requests.

    Starts a dedicated crawler in stream mode and returns it together with
    the async generator of results. The CALLER owns the crawler lifecycle —
    stream_results() closes it when the stream ends; this function only
    closes it when setup itself fails.

    Raises:
        HTTPException: 500 when setup fails (before any bytes are streamed).
    """
    try:
        browser_config = BrowserConfig.load(browser_config)
        # browser_config.verbose = True # Set to False or remove for production stress testing
        browser_config.verbose = False
        crawler_config = CrawlerRunConfig.load(crawler_config)
        crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
        # Force incremental results so the HTTP layer can stream them out.
        crawler_config.stream = True
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
            rate_limiter=RateLimiter(
                base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
            )
        )
        crawler = AsyncWebCrawler(config=browser_config)
        await crawler.start()
        results_gen = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
            dispatcher=dispatcher
        )
        return crawler, results_gen
    except Exception as e:
        # Make sure to close crawler if started during an error here
        if 'crawler' in locals() and crawler.ready:
            try:
                await crawler.close()
            except Exception as close_e:
                logger.error(f"Error closing crawler during stream setup exception: {close_e}")
        logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
        # Raising HTTPException here will prevent streaming response
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )

View File

@@ -377,14 +377,14 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
except asyncio.CancelledError:
logger.warning("Client disconnected during streaming")
# finally:
# try:
# await crawler.close()
# except Exception as e:
# logger.error(f"Crawler cleanup error: {e}")
finally:
# try:
# await crawler.close()
# except Exception as e:
# logger.error(f"Crawler cleanup error: {e}")
pass
async def handle_crawl_request(
crawler: AsyncWebCrawler,
urls: List[str],
browser_config: dict,
crawler_config: dict,
@@ -404,24 +404,29 @@ async def handle_crawl_request(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
)
) if config["crawler"]["rate_limiter"]["enabled"] else None
)
from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
base_config = config["crawler"]["base_config"]
# Iterate over key-value pairs in base_config and use hasattr to set them
for key, value in base_config.items():
if hasattr(crawler_config, key):
setattr(crawler_config, key, value)
results = []
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
partial_func = partial(func,
urls[0] if len(urls) == 1 else urls,
config=crawler_config,
dispatcher=dispatcher)
# Simulate work being done by the crawler
# logger.debug(f"Request (URLs: {len(urls)}) starting simulated work...") # Add log
# await asyncio.sleep(2) # <--- ADD ARTIFICIAL DELAY (e.g., 0.5 seconds)
# logger.debug(f"Request (URLs: {len(urls)}) finished simulated work.")
results = await partial_func()
# await crawler.close()
end_mem_mb = _get_memory_mb() # <--- Get memory after
@@ -442,11 +447,12 @@ async def handle_crawl_request(
except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True)
# if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {close_e}")
# Measure memory even on error if possible
end_mem_mb_error = _get_memory_mb()
@@ -463,7 +469,6 @@ async def handle_crawl_request(
)
async def handle_stream_crawl_request(
crawler: AsyncWebCrawler,
urls: List[str],
browser_config: dict,
crawler_config: dict,
@@ -485,6 +490,9 @@ async def handle_stream_crawl_request(
)
)
from crawler_pool import get_crawler
crawler = await get_crawler(browser_config)
# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
@@ -494,17 +502,16 @@ async def handle_stream_crawl_request(
dispatcher=dispatcher
)
# Return the *same* crawler instance and the generator
# The caller (server.py) manages the crawler lifecycle via the pool context
return crawler, results_gen
except Exception as e:
# Make sure to close crawler if started during an error here
# if 'crawler' in locals() and crawler.ready:
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
if 'crawler' in locals() and crawler.ready:
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
# Raising HTTPException here will prevent streaming response
raise HTTPException(

View File

@@ -5,6 +5,7 @@ app:
host: "0.0.0.0"
port: 8020
reload: False
workers: 4
timeout_keep_alive: 300
# Default LLM Configuration
@@ -48,53 +49,38 @@ security:
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Pool Configuration
crawler_pool:
enabled: true # Set to false to disable the pool
# --- Option 1: Auto-calculate size ---
auto_calculate_size: true
calculation_params:
mem_headroom_mb: 512 # Memory reserved for OS/other apps
avg_page_mem_mb: 150 # Estimated MB per concurrent "tab"/page in browsers
fd_per_page: 20 # Estimated file descriptors per page
core_multiplier: 4 # Max crawlers per CPU core
min_pool_size: 2 # Minimum number of primary crawlers
max_pool_size: 16 # Maximum number of primary crawlers
# --- Option 2: Manual size (ignored if auto_calculate_size is true) ---
# pool_size: 8
# --- Other Pool Settings ---
backup_pool_size: 1 # Number of backup crawlers
max_wait_time_s: 30.0 # Max seconds a request waits for a free crawler
throttle_threshold_percent: 70.0 # Start throttling delay above this % usage
throttle_delay_min_s: 0.1 # Min throttle delay
throttle_delay_max_s: 0.5 # Max throttle delay
# --- Browser Config for Pooled Crawlers ---
browser_config:
# No need for "type": "BrowserConfig" here, just params
headless: true
verbose: false # Keep pool crawlers less verbose in production
# user_agent: "MyPooledCrawler/1.0" # Example
# Add other BrowserConfig params as needed (e.g., proxy, viewport)
# Crawler Configuration
crawler:
base_config:
simulate_user: true
memory_threshold_percent: 95.0
rate_limiter:
enabled: true
base_delay: [1.0, 2.0]
timeouts:
stream_init: 30.0 # Timeout for stream initialization
batch_process: 300.0 # Timeout for batch processing
pool:
max_pages: 40 # ← GLOBAL_SEM permits
idle_ttl_sec: 1800 # ← 30 min janitor cutoff
browser:
kwargs:
headless: true
text_mode: true
extra_args:
# - "--single-process"
- "--no-sandbox"
- "--disable-dev-shm-usage"
- "--disable-gpu"
- "--disable-software-rasterizer"
- "--disable-web-security"
- "--allow-insecure-localhost"
- "--ignore-certificate-errors"
# Logging Configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
file: "logs/app.log"
verbose: true
# Observability Configuration
observability:
@@ -102,4 +88,4 @@ observability:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
endpoint: "/health"

View File

@@ -1,556 +0,0 @@
# crawler_manager.py
import asyncio
import time
import uuid
import psutil
import os
import resource # For FD limit
import random
import math
from typing import Optional, Tuple, Any, List, Dict, AsyncGenerator
from pydantic import BaseModel, Field, field_validator
from contextlib import asynccontextmanager
import logging
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, AsyncLogger
# Assuming api.py handlers are accessible or refactored slightly if needed
# We might need to import the specific handler functions if we call them directly
# from api import handle_crawl_request, handle_stream_crawl_request, _get_memory_mb, stream_results
# --- Custom Exceptions ---
class PoolTimeoutError(Exception):
"""Raised when waiting for a crawler resource times out."""
pass
class PoolConfigurationError(Exception):
"""Raised for configuration issues."""
pass
class NoHealthyCrawlerError(Exception):
"""Raised when no healthy crawler is available."""
pass
# --- Configuration Models ---
# NOTE(review): this class is redefined verbatim later in this module; the
# later definition shadows this one at import time. One copy should be removed.
class CalculationParams(BaseModel):
    """Tunable inputs for auto-sizing the crawler pool from system resources."""
    mem_headroom_mb: int = 512    # RAM reserved for the OS / other processes
    avg_page_mem_mb: int = 150    # Estimated RAM per concurrent page
    fd_per_page: int = 20         # Estimated file descriptors per page
    core_multiplier: int = 4      # Max concurrent pages per CPU core
    min_pool_size: int = 1  # Min safe pages should be at least 1
    max_pool_size: int = 16

    # V2 validation for avg_page_mem_mb
    @field_validator('avg_page_mem_mb')
    @classmethod
    def check_avg_page_mem(cls, v: int) -> int:
        if v <= 0:
            raise ValueError("avg_page_mem_mb must be positive")
        return v

    # V2 validation for fd_per_page
    @field_validator('fd_per_page')
    @classmethod
    def check_fd_per_page(cls, v: int) -> int:
        if v <= 0:
            raise ValueError("fd_per_page must be positive")
        return v
# crawler_manager.py
# ... (imports including BaseModel, Field from pydantic) ...
from pydantic import BaseModel, Field, field_validator # <-- Import field_validator
# --- Configuration Models (Pydantic V2 Syntax) ---
# --- Configuration Models (Pydantic V2 Syntax) ---
# NOTE(review): duplicate of the CalculationParams defined earlier in this
# module; being later, this definition is the one that takes effect.
class CalculationParams(BaseModel):
    """Tunable inputs for auto-sizing the crawler pool from system resources."""
    mem_headroom_mb: int = 512    # RAM reserved for the OS / other processes
    avg_page_mem_mb: int = 150    # Estimated RAM per concurrent page
    fd_per_page: int = 20         # Estimated file descriptors per page
    core_multiplier: int = 4      # Max concurrent pages per CPU core
    min_pool_size: int = 1  # Min safe pages should be at least 1
    max_pool_size: int = 16

    # V2 validation for avg_page_mem_mb
    @field_validator('avg_page_mem_mb')
    @classmethod
    def check_avg_page_mem(cls, v: int) -> int:
        if v <= 0:
            raise ValueError("avg_page_mem_mb must be positive")
        return v

    # V2 validation for fd_per_page
    @field_validator('fd_per_page')
    @classmethod
    def check_fd_per_page(cls, v: int) -> int:
        if v <= 0:
            raise ValueError("fd_per_page must be positive")
        return v
class CrawlerManagerConfig(BaseModel):
    """Top-level configuration for CrawlerManager (pool sizing, throttling, browser)."""
    enabled: bool = True                 # Master switch; a disabled manager does nothing
    auto_calculate_size: bool = True     # Derive pool size from system resources
    calculation_params: CalculationParams = Field(default_factory=CalculationParams)  # Use Field for default_factory
    backup_pool_size: int = Field(1, ge=0)  # Allow 0 backups
    max_wait_time_s: float = 30.0        # Max seconds a request waits for a free crawler
    throttle_threshold_percent: float = Field(70.0, ge=0, le=100)  # Start throttling above this usage
    throttle_delay_min_s: float = 0.1    # Min artificial delay while throttling
    throttle_delay_max_s: float = 0.5    # Max artificial delay while throttling
    browser_config: Dict[str, Any] = Field(default_factory=lambda: {"headless": True, "verbose": False})  # Use Field for default_factory
    # presumably the backoff before re-creating a failed primary — confirm against reload logic
    primary_reload_delay_s: float = 60.0
# --- Crawler Manager ---
class CrawlerManager:
"""Manages shared AsyncWebCrawler instances, concurrency, and failover."""
def __init__(self, config: CrawlerManagerConfig, logger = None):
    """Set up pool state; actual crawlers are created later in initialize().

    Args:
        config: Pool configuration (sizing, backups, throttling, browser).
        logger: Optional logger; defaults to a module-level logger at INFO.
    """
    # Fix: bind the logger FIRST — the disabled branch below previously used
    # self.logger before it was ever assigned, raising AttributeError.
    if logger is None:
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
    else:
        self.logger = logger
    if not config.enabled:
        self.logger.warning("CrawlerManager is disabled by configuration.")
        # Set defaults to allow server to run, but manager won't function
        self.config = config
        # Fix: was `self._initialized = False,` — a truthy one-tuple, which
        # made the disabled manager look initialized to truthiness checks.
        self._initialized = False
        return
    self.config = config
    self._primary_crawler: Optional[AsyncWebCrawler] = None
    self._secondary_crawlers: List[AsyncWebCrawler] = []
    self._active_crawler_index: int = 0  # 0 for primary, 1+ for secondary index
    self._primary_healthy: bool = False
    self._secondary_healthy_flags: List[bool] = []
    self._safe_pages: int = 1  # Default, calculated in initialize
    self._semaphore: Optional[asyncio.Semaphore] = None
    self._state_lock = asyncio.Lock()  # Protects active_crawler, health flags
    self._reload_tasks: List[Optional[asyncio.Task]] = []  # Track reload background tasks
    self._initialized = False
    self._shutting_down = False
    self.logger.info("CrawlerManager initialized with config.")
    self.logger.debug(f"Config: {self.config.model_dump_json(indent=2)}")
def is_enabled(self) -> bool:
    """Report whether the manager is active: enabled by config AND initialized."""
    return bool(self.config.enabled and self._initialized)
def _get_system_resources(self) -> Tuple[int, int, int]:
    """Gets RAM, CPU cores, and FD limit.

    Returns:
        (total_ram_mb, cpu_cores, fd_limit), falling back to conservative
        defaults (2048 MB, 2 cores, 1024 FDs) when a probe fails — e.g. the
        FD probe on platforms without resource.getrlimit.
    """
    total_ram_mb = 0
    cpu_cores = 0
    try:
        mem_info = psutil.virtual_memory()
        total_ram_mb = mem_info.total // (1024 * 1024)
        cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True)  # Prefer physical cores
    except Exception as e:
        self.logger.warning(f"Could not get RAM/CPU info via psutil: {e}")
        total_ram_mb = 2048  # Default fallback
        cpu_cores = 2  # Default fallback
    fd_limit = 1024  # Default fallback
    try:
        soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        fd_limit = soft_limit  # Use the soft limit
    except (ImportError, ValueError, OSError, AttributeError) as e:
        self.logger.warning(f"Could not get file descriptor limit (common on Windows): {e}. Using default: {fd_limit}")
    self.logger.info(f"System Resources: RAM={total_ram_mb}MB, Cores={cpu_cores}, FD Limit={fd_limit}")
    return total_ram_mb, cpu_cores, fd_limit
def _calculate_safe_pages(self) -> int:
    """Calculates the safe number of concurrent pages based on resources.

    Takes the most constraining of three limits — available RAM per page,
    file descriptors per page, and cores * multiplier — then clamps the
    result to [min_pool_size, max_pool_size].
    """
    if not self.config.auto_calculate_size:
        # If auto-calc is off, use max_pool_size as the hard limit
        # This isn't ideal based on the prompt, but provides *some* manual override
        # A dedicated `manual_safe_pages` might be better. Let's use max_pool_size for now.
        self.logger.warning("Auto-calculation disabled. Using max_pool_size as safe_pages limit.")
        return self.config.calculation_params.max_pool_size
    params = self.config.calculation_params
    total_ram_mb, cpu_cores, fd_limit = self._get_system_resources()
    available_ram_mb = total_ram_mb - params.mem_headroom_mb
    if available_ram_mb <= 0:
        self.logger.error(f"Not enough RAM ({total_ram_mb}MB) after headroom ({params.mem_headroom_mb}MB). Cannot calculate safe pages.")
        return params.min_pool_size  # Fallback to minimum
    try:
        # Calculate limits from each resource
        mem_limit = available_ram_mb // params.avg_page_mem_mb if params.avg_page_mem_mb > 0 else float('inf')
        fd_limit_pages = fd_limit // params.fd_per_page if params.fd_per_page > 0 else float('inf')
        cpu_limit = cpu_cores * params.core_multiplier if cpu_cores > 0 else float('inf')
        # Determine the most constraining limit
        calculated_limit = math.floor(min(mem_limit, fd_limit_pages, cpu_limit))
    except ZeroDivisionError:
        # NOTE(review): unreachable given the `> 0` guards above; if it ever
        # fired, the info log below would reference unbound mem_limit etc.
        self.logger.error("Division by zero in safe_pages calculation (avg_page_mem_mb or fd_per_page is zero).")
        calculated_limit = params.min_pool_size  # Fallback
    # Clamp the result within min/max bounds
    safe_pages = max(params.min_pool_size, min(calculated_limit, params.max_pool_size))
    self.logger.info(f"Calculated safe pages: MemoryLimit={mem_limit}, FDLimit={fd_limit_pages}, CPULimit={cpu_limit} -> RawCalc={calculated_limit} -> Clamped={safe_pages}")
    return safe_pages
async def _create_and_start_crawler(self, crawler_id: str) -> Optional[AsyncWebCrawler]:
    """Build a crawler from the pool's browser settings and start it.

    Returns the running instance, or None when startup fails — the error is
    logged rather than raised so callers can degrade to backup crawlers.
    """
    try:
        browser_settings = BrowserConfig(**self.config.browser_config)
        instance = AsyncWebCrawler(config=browser_settings)
        await instance.start()
        self.logger.info(f"Successfully started crawler instance: {crawler_id}")
        return instance
    except Exception as exc:
        self.logger.error(f"Failed to start crawler instance {crawler_id}: {exc}", exc_info=True)
        return None
async def initialize(self):
    """Initializes crawlers and semaphore. Called at server startup.

    Computes the safe page budget, starts the primary and the configured
    number of backup crawlers, and selects the first healthy instance as
    active (primary preferred). Idempotent: returns immediately when the
    manager is disabled or already initialized. Marks itself initialized
    even when no crawler came up healthy (a critical log is emitted).
    """
    if not self.config.enabled or self._initialized:
        return
    self.logger.info("Initializing CrawlerManager...")
    self._safe_pages = self._calculate_safe_pages()
    self._semaphore = asyncio.Semaphore(self._safe_pages)
    self._primary_crawler = await self._create_and_start_crawler("Primary")
    if self._primary_crawler:
        self._primary_healthy = True
    else:
        self._primary_healthy = False
        self.logger.critical("Primary crawler failed to initialize!")
    self._secondary_crawlers = []
    self._secondary_healthy_flags = []
    self._reload_tasks = [None] * (1 + self.config.backup_pool_size)  # For primary + backups
    for i in range(self.config.backup_pool_size):
        sec_id = f"Secondary-{i+1}"
        crawler = await self._create_and_start_crawler(sec_id)
        self._secondary_crawlers.append(crawler)  # Add even if None
        self._secondary_healthy_flags.append(crawler is not None)
        if crawler is None:
            self.logger.error(f"{sec_id} crawler failed to initialize!")
    # Set initial active crawler (prefer primary)
    if self._primary_healthy:
        self._active_crawler_index = 0
        self.logger.info("Primary crawler is active.")
    else:
        # Find the first healthy secondary
        found_healthy_backup = False
        for i, healthy in enumerate(self._secondary_healthy_flags):
            if healthy:
                self._active_crawler_index = i + 1  # 1-based index for secondaries
                self.logger.warning(f"Primary failed, Secondary-{i+1} is active.")
                found_healthy_backup = True
                break
        if not found_healthy_backup:
            self.logger.critical("FATAL: No healthy crawlers available after initialization!")
            # Server should probably refuse connections in this state
    self._initialized = True
    self.logger.info(f"CrawlerManager initialized. Safe Pages: {self._safe_pages}. Active Crawler Index: {self._active_crawler_index}")
async def shutdown(self):
"""Cancel reload tasks and close every crawler instance.

Called at server shutdown. Idempotent: returns immediately if never
initialized or already shutting down. Errors while closing individual
crawlers are logged and do not abort the rest of the teardown.
"""
if not self._initialized or self._shutting_down:
return
self._shutting_down = True
self.logger.info("Shutting down CrawlerManager...")
# Cancel any ongoing reload tasks
for i, task in enumerate(self._reload_tasks):
if task and not task.done():
try:
task.cancel()
await task # Wait for cancellation
self.logger.info(f"Cancelled reload task for crawler index {i}.")
except asyncio.CancelledError:
self.logger.info(f"Reload task for crawler index {i} was already cancelled.")
except Exception as e:
self.logger.warning(f"Error cancelling reload task for crawler index {i}: {e}")
self._reload_tasks = []
# Close primary
if self._primary_crawler:
try:
self.logger.info("Closing primary crawler...")
await self._primary_crawler.close()
self._primary_crawler = None
except Exception as e:
self.logger.error(f"Error closing primary crawler: {e}", exc_info=True)
# Close secondaries
for i, crawler in enumerate(self._secondary_crawlers):
if crawler:
try:
self.logger.info(f"Closing secondary crawler {i+1}...")
await crawler.close()
except Exception as e:
self.logger.error(f"Error closing secondary crawler {i+1}: {e}", exc_info=True)
self._secondary_crawlers = []
self._initialized = False
self.logger.info("CrawlerManager shut down complete.")
@asynccontextmanager
async def get_crawler(self) -> AsyncGenerator[AsyncWebCrawler, None]:
"""Async context manager yielding the currently active crawler.

Flow: optional soft-throttle delay -> bounded semaphore acquire ->
active-crawler selection under the state lock (with immediate failover
if the selected instance is unhealthy) -> yield. Any exception raised
while the caller uses the crawler triggers `_handle_crawler_failure`.

Raises:
PoolTimeoutError: semaphore not acquired within max_wait_time_s.
NoHealthyCrawlerError: manager disabled/shutting down, or no
healthy instance could be selected.
"""
if not self.is_enabled():
raise NoHealthyCrawlerError("CrawlerManager is disabled or not initialized.")
if self._shutting_down:
raise NoHealthyCrawlerError("CrawlerManager is shutting down.")
active_crawler: Optional[AsyncWebCrawler] = None
acquired = False
request_id = uuid.uuid4()
start_wait = time.time()
# --- Throttling ---
# NOTE: reads asyncio.Semaphore's private `_value` to estimate load
# without acquiring; best-effort only, so failures are swallowed below.
try:
# Check semaphore value without acquiring
current_usage = self._safe_pages - self._semaphore._value
usage_percent = (current_usage / self._safe_pages) * 100 if self._safe_pages > 0 else 0
if usage_percent >= self.config.throttle_threshold_percent:
delay = random.uniform(self.config.throttle_delay_min_s, self.config.throttle_delay_max_s)
self.logger.debug(f"Throttling: Usage {usage_percent:.1f}% >= {self.config.throttle_threshold_percent}%. Delaying {delay:.3f}s")
await asyncio.sleep(delay)
except Exception as e:
self.logger.warning(f"Error during throttling check: {e}") # Continue attempt even if throttle check fails
# --- Acquire Semaphore ---
try:
# self.logger.debug(f"Attempting to acquire semaphore (Available: {self._semaphore._value}/{self._safe_pages}). Wait Timeout: {self.config.max_wait_time_s}s")
# --- Logging Before Acquire ---
sem_value = self._semaphore._value if self._semaphore else 'N/A'
sem_waiters = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0
self.logger.debug(f"Req {request_id}: Attempting acquire. Available={sem_value}/{self._safe_pages}, Waiters={sem_waiters}, Timeout={self.config.max_wait_time_s}s")
await asyncio.wait_for(
self._semaphore.acquire(), timeout=self.config.max_wait_time_s
)
acquired = True
wait_duration = time.time() - start_wait
# Waits longer than 1s are surfaced at WARNING; the next debug line
# logs every acquire regardless.
if wait_duration > 1:
self.logger.warning(f"Semaphore acquired after {wait_duration:.3f}s. (Available: {self._semaphore._value}/{self._safe_pages})")
self.logger.debug(f"Semaphore acquired successfully after {wait_duration:.3f}s. (Available: {self._semaphore._value}/{self._safe_pages})")
# --- Select Active Crawler (Critical Section) ---
async with self._state_lock:
current_active_index = self._active_crawler_index
is_primary_active = (current_active_index == 0)
if is_primary_active:
if self._primary_healthy and self._primary_crawler:
active_crawler = self._primary_crawler
else:
# Primary is supposed to be active but isn't healthy
self.logger.warning("Primary crawler unhealthy, attempting immediate failover...")
if not await self._try_failover_sync(): # Try to switch active crawler NOW
raise NoHealthyCrawlerError("Primary unhealthy and no healthy backup available.")
# If failover succeeded, active_crawler_index is updated
current_active_index = self._active_crawler_index
# Fall through to select the new active secondary
# Check if we need to use a secondary (either initially or after failover)
if current_active_index > 0:
secondary_idx = current_active_index - 1
if secondary_idx < len(self._secondary_crawlers) and \
self._secondary_healthy_flags[secondary_idx] and \
self._secondary_crawlers[secondary_idx]:
active_crawler = self._secondary_crawlers[secondary_idx]
else:
self.logger.error(f"Selected Secondary-{current_active_index} is unhealthy or missing.")
# Attempt failover to *another* secondary if possible? (Adds complexity)
# For now, raise error if the selected one isn't good.
raise NoHealthyCrawlerError(f"Selected Secondary-{current_active_index} is unavailable.")
if active_crawler is None:
# This shouldn't happen if logic above is correct, but safeguard
raise NoHealthyCrawlerError("Failed to select a healthy active crawler.")
# --- Yield Crawler ---
# The state lock is released before yielding; only selection is serialized.
try:
yield active_crawler
except Exception as crawl_error:
self.logger.error(f"Error during crawl execution using {active_crawler}: {crawl_error}", exc_info=True)
# Determine if this error warrants failover
# For now, let's assume any exception triggers a health check/failover attempt
await self._handle_crawler_failure(active_crawler)
raise # Re-raise the original error for the API handler
except asyncio.TimeoutError:
self.logger.warning(f"Timeout waiting for semaphore after {self.config.max_wait_time_s}s.")
raise PoolTimeoutError(f"Timed out waiting for available crawler resource after {self.config.max_wait_time_s}s")
except NoHealthyCrawlerError:
# Logged within the selection logic
raise # Re-raise for API handler
except Exception as e:
self.logger.error(f"Unexpected error in get_crawler context manager: {e}", exc_info=True)
raise # Re-raise potentially unknown errors
finally:
# Release exactly once, and only if we actually acquired.
if acquired:
self._semaphore.release()
self.logger.debug(f"Semaphore released. (Available: {self._semaphore._value}/{self._safe_pages})")
async def _try_failover_sync(self) -> bool:
"""Switch `_active_crawler_index` to the next healthy secondary.

Must be called while holding `_state_lock` (despite the `async def`,
it never awaits). Returns True if a healthy active crawler exists
after the call (including the no-op case where the primary is still
healthy), False if primary is down and no healthy backup was found.
"""
if not self._primary_healthy: # Only failover if primary is already marked down
found_healthy_backup = False
# Start scanning just after the currently active index so repeated
# failovers rotate through the backups rather than always picking #1.
start_idx = (self._active_crawler_index % (self.config.backup_pool_size +1)) # Start check after current
for i in range(self.config.backup_pool_size):
check_idx = (start_idx + i) % self.config.backup_pool_size # Circular check
if self._secondary_healthy_flags[check_idx] and self._secondary_crawlers[check_idx]:
self._active_crawler_index = check_idx + 1
self.logger.warning(f"Failover successful: Switched active crawler to Secondary-{self._active_crawler_index}")
found_healthy_backup = True
break # Found one
if not found_healthy_backup:
# If primary is down AND no backups are healthy, mark primary as active index (0) but it's still unhealthy
self._active_crawler_index = 0
self.logger.error("Failover failed: No healthy secondary crawlers available.")
return False
return True
return True # Primary is healthy, no failover needed
async def _handle_crawler_failure(self, failed_crawler: AsyncWebCrawler):
"""Mark `failed_crawler` unhealthy, fail over, and schedule its reload.

Identity-compares the failed instance against the primary and each
secondary. Reports for instances that are unknown or already marked
unhealthy are ignored (prevents duplicate reload tasks). No-op during
shutdown.
"""
if self._shutting_down: return # Don't handle failures during shutdown
async with self._state_lock:
crawler_index = -1
is_primary = False
if failed_crawler is self._primary_crawler and self._primary_healthy:
self.logger.warning("Primary crawler reported failure.")
self._primary_healthy = False
is_primary = True
crawler_index = 0
# Try immediate failover within the lock
await self._try_failover_sync()
# Start reload task if not already running for primary
if self._reload_tasks[0] is None or self._reload_tasks[0].done():
self.logger.info("Initiating primary crawler reload task.")
self._reload_tasks[0] = asyncio.create_task(self._reload_crawler(0))
else:
# Check if it was one of the secondaries
for i, crawler in enumerate(self._secondary_crawlers):
if failed_crawler is crawler and self._secondary_healthy_flags[i]:
self.logger.warning(f"Secondary-{i+1} crawler reported failure.")
self._secondary_healthy_flags[i] = False
is_primary = False
crawler_index = i + 1
# If this *was* the active crawler, trigger failover check
if self._active_crawler_index == crawler_index:
self.logger.warning(f"Active secondary {crawler_index} failed, attempting failover...")
await self._try_failover_sync()
# Start reload task for this secondary
if self._reload_tasks[crawler_index] is None or self._reload_tasks[crawler_index].done():
self.logger.info(f"Initiating Secondary-{i+1} crawler reload task.")
self._reload_tasks[crawler_index] = asyncio.create_task(self._reload_crawler(crawler_index))
break # Found the failed secondary
if crawler_index == -1:
self.logger.debug("Failure reported by an unknown or already unhealthy crawler instance. Ignoring.")
async def _reload_crawler(self, crawler_index_to_reload: int):
"""Background task: close, recreate and restart one crawler instance.

Index 0 is the primary; index i>0 maps to Secondary-i. On success the
instance is marked healthy again and may become the active crawler
(primary preferentially reclaims the active slot). On failure it
stays unhealthy. Always clears its own `_reload_tasks` slot at the end.
"""
is_primary = (crawler_index_to_reload == 0)
crawler_id = "Primary" if is_primary else f"Secondary-{crawler_index_to_reload}"
original_crawler = self._primary_crawler if is_primary else self._secondary_crawlers[crawler_index_to_reload - 1]
self.logger.info(f"Starting reload process for {crawler_id}...")
# 1. Delay before attempting reload (e.g., allow transient issues to clear)
if not is_primary: # Maybe shorter delay for backups?
await asyncio.sleep(self.config.primary_reload_delay_s / 2)
else:
await asyncio.sleep(self.config.primary_reload_delay_s)
# 2. Attempt to close the old instance cleanly
if original_crawler:
try:
self.logger.info(f"Attempting to close existing {crawler_id} instance...")
await original_crawler.close()
self.logger.info(f"Successfully closed old {crawler_id} instance.")
except Exception as e:
self.logger.warning(f"Error closing old {crawler_id} instance during reload: {e}")
# 3. Create and start a new instance
self.logger.info(f"Attempting to start new {crawler_id} instance...")
new_crawler = await self._create_and_start_crawler(crawler_id)
# 4. Update state if successful
async with self._state_lock:
if new_crawler:
self.logger.info(f"Successfully reloaded {crawler_id}. Marking as healthy.")
if is_primary:
self._primary_crawler = new_crawler
self._primary_healthy = True
# Switch back to primary if no other failures occurred
# Check if ANY secondary is currently active
secondary_is_active = self._active_crawler_index > 0
if not secondary_is_active or not self._secondary_healthy_flags[self._active_crawler_index - 1]:
self.logger.info("Switching active crawler back to primary.")
self._active_crawler_index = 0
else: # Is secondary
secondary_idx = crawler_index_to_reload - 1
self._secondary_crawlers[secondary_idx] = new_crawler
self._secondary_healthy_flags[secondary_idx] = True
# Potentially switch back if primary is still down and this was needed?
if not self._primary_healthy and self._active_crawler_index == 0:
self.logger.info(f"Primary still down, activating reloaded Secondary-{crawler_index_to_reload}.")
self._active_crawler_index = crawler_index_to_reload
else:
self.logger.error(f"Failed to reload {crawler_id}. It remains unhealthy.")
# Keep the crawler marked as unhealthy
if is_primary:
self._primary_healthy = False # Ensure it stays false
else:
self._secondary_healthy_flags[crawler_index_to_reload - 1] = False
# Clear the reload task reference for this index
self._reload_tasks[crawler_index_to_reload] = None
async def get_status(self) -> Dict:
"""Return a status snapshot dict for health/monitoring endpoints.

Taken under `_state_lock` for a consistent view. Includes semaphore
availability/waiters (read from asyncio.Semaphore private attributes),
per-crawler health, the active crawler id, and in-flight reload tasks.
Returns {"status": "disabled"} when the manager is not enabled.
"""
if not self.is_enabled():
return {"status": "disabled"}
async with self._state_lock:
active_id = "Primary" if self._active_crawler_index == 0 else f"Secondary-{self._active_crawler_index}"
primary_status = "Healthy" if self._primary_healthy else "Unhealthy"
secondary_statuses = [f"Secondary-{i+1}: {'Healthy' if healthy else 'Unhealthy'}"
for i, healthy in enumerate(self._secondary_healthy_flags)]
semaphore_available = self._semaphore._value if self._semaphore else 'N/A'
semaphore_locked = len(self._semaphore._waiters) if self._semaphore and self._semaphore._waiters else 0
return {
"status": "enabled",
"safe_pages": self._safe_pages,
"semaphore_available": semaphore_available,
"semaphore_waiters": semaphore_locked,
"active_crawler": active_id,
"primary_status": primary_status,
"secondary_statuses": secondary_statuses,
"reloading_tasks": [i for i, t in enumerate(self._reload_tasks) if t and not t.done()]
}

View File

@@ -0,0 +1,60 @@
# crawler_pool.py (new file)
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict
from crawl4ai import AsyncWebCrawler, BrowserConfig
from typing import Dict
from utils import load_config
CONFIG = load_config()
POOL: Dict[str, AsyncWebCrawler] = {}
LAST_USED: Dict[str, float] = {}
LOCK = asyncio.Lock()
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30min
def _sig(cfg: BrowserConfig) -> str:
    """Stable pool key for a browser config: sha1 of its canonical JSON dump."""
    canonical = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
    return hashlib.sha1(canonical.encode()).hexdigest()
async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
    """Return a pooled AsyncWebCrawler for *cfg*, launching one if needed.

    Crawlers are keyed by the sha1 signature of their BrowserConfig (see
    _sig), so identical configs share a single browser instance. The idle
    timestamp in LAST_USED is refreshed on every successful lookup/creation.

    Raises:
        MemoryError: system RAM usage is at/above MEM_LIMIT; no new
            browser is launched.
        RuntimeError: the new browser failed to start.
    """
    # Compute the key before any locking; previously this happened inside a
    # try whose `finally` referenced `sig`, causing UnboundLocalError when
    # _sig itself raised.
    sig = _sig(cfg)
    async with LOCK:
        # Fast path: reuse an existing browser and refresh its idle clock.
        if sig in POOL:
            LAST_USED[sig] = time.time()
            return POOL[sig]
        # Refuse to launch a new browser under memory pressure.
        if psutil.virtual_memory().percent >= MEM_LIMIT:
            raise MemoryError("RAM pressure new browser denied")
        crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
        try:
            await crawler.start()
        except Exception as e:
            # Startup failed: ensure no half-initialized entry lingers.
            # All pool mutation stays under LOCK (the old `finally` touched
            # POOL/LAST_USED after the lock was released).
            POOL.pop(sig, None)
            LAST_USED.pop(sig, None)
            raise RuntimeError(f"Failed to start browser: {e}") from e
        POOL[sig] = crawler
        LAST_USED[sig] = time.time()
        return crawler
async def close_all():
    """Close every pooled crawler concurrently and reset pool bookkeeping."""
    async with LOCK:
        # Close failures are tolerated: gather collects exceptions instead
        # of raising, matching best-effort shutdown semantics.
        closers = [crawler.close() for crawler in POOL.values()]
        await asyncio.gather(*closers, return_exceptions=True)
        POOL.clear()
        LAST_USED.clear()
async def janitor():
    """Background task: every 60s, close and evict crawlers idle > IDLE_TTL.

    Runs forever; intended to be wrapped in an asyncio task and cancelled
    at shutdown (cancellation propagates out of the sleep).
    """
    while True:
        await asyncio.sleep(60)
        now = time.time()
        async with LOCK:
            # list() so we can pop from POOL while iterating.
            for sig, crawler in list(POOL.items()):
                # .get(sig, now) guards against a missing timestamp record:
                # a KeyError here would kill the janitor task permanently.
                if now - LAST_USED.get(sig, now) > IDLE_TTL:
                    with suppress(Exception):
                        await crawler.close()
                    POOL.pop(sig, None)
                    LAST_USED.pop(sig, None)

View File

@@ -1,167 +1,200 @@
# Import from auth.py
from auth import create_access_token, get_token_dependency, TokenRequest
from api import (
handle_markdown_request,
handle_llm_qa,
handle_stream_crawl_request,
handle_crawl_request,
stream_results,
_get_memory_mb
)
from utils import FilterType, load_config, setup_logging, verify_email_domain
import os
import sys
import time
from typing import List, Optional, Dict, AsyncGenerator
# ───────────────────────── server.py ─────────────────────────
"""
Crawl4AI FastAPI entrypoint
• Browser pool + global page cap
• Ratelimiting, security, metrics
• /crawl, /crawl/stream, /md, /llm endpoints
"""
# ── stdlib & 3rdparty imports ───────────────────────────────
import os, sys, time, asyncio
from typing import List, Optional, Dict
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends, status
from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
import pathlib
from fastapi import (
FastAPI, HTTPException, Request, Path, Query, Depends
)
from fastapi.responses import (
StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
)
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.staticfiles import StaticFiles
import ast, crawl4ai as _c4
from pydantic import BaseModel, Field
from slowapi import Limiter
from slowapi.util import get_remote_address
from prometheus_fastapi_instrumentator import Instrumentator
from redis import asyncio as aioredis
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
AsyncLogger
)
from crawler_manager import (
CrawlerManager,
CrawlerManagerConfig,
PoolTimeoutError,
NoHealthyCrawlerError
)
# ── internal imports (after sys.path append) ─────────────────
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from utils import (
FilterType, load_config, setup_logging, verify_email_domain
)
from api import (
handle_markdown_request, handle_llm_qa,
handle_stream_crawl_request, handle_crawl_request,
stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawler_pool import get_crawler, close_all, janitor
__version__ = "0.2.6"
class CrawlRequest(BaseModel):
"""Request body for /crawl and /crawl/stream: target URLs plus optional config dicts."""
# 1-100 URLs per request (enforced by pydantic Field constraints).
urls: List[str] = Field(min_length=1, max_length=100)
# Serialized BrowserConfig overrides; empty dict means server defaults.
browser_config: Optional[Dict] = Field(default_factory=dict)
# Serialized CrawlerRunConfig overrides; empty dict means server defaults.
crawler_config: Optional[Dict] = Field(default_factory=dict)
# Load configuration and setup
# ────────────────── configuration / logging ──────────────────
config = load_config()
setup_logging(config)
logger = AsyncLogger(
log_file=config["logging"].get("log_file", "app.log"),
verbose=config["logging"].get("verbose", False),
tag_width=10,
)
# Initialize Redis
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
__version__ = "0.5.1-d1"
# Initialize rate limiter
limiter = Limiter(
key_func=get_remote_address,
default_limits=[config["rate_limiting"]["default_limit"]],
storage_uri=config["rate_limiting"]["storage_uri"]
)
# ── global page semaphore (hard cap) ─────────────────────────
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
# --- Initialize Manager (will be done in lifespan) ---
# Load manager config from the main config
manager_config_dict = config.get("crawler_pool", {})
# Use Pydantic to parse and validate
manager_config = CrawlerManagerConfig(**manager_config_dict)
crawler_manager = CrawlerManager(config=manager_config, logger=logger)
# --- FastAPI App and Lifespan ---
# import logging
# page_log = logging.getLogger("page_cap")
# orig_arun = AsyncWebCrawler.arun
# async def capped_arun(self, *a, **kw):
# await GLOBAL_SEM.acquire() # ← take slot
# try:
# in_flight = MAX_PAGES - GLOBAL_SEM._value # used permits
# page_log.info("🕸️ pages_in_flight=%s / %s", in_flight, MAX_PAGES)
# return await orig_arun(self, *a, **kw)
# finally:
# GLOBAL_SEM.release() # ← free slot
# Monkey-patch AsyncWebCrawler.arun so every page fetch, regardless of which
# pooled crawler runs it, counts against the single GLOBAL_SEM page cap.
orig_arun = AsyncWebCrawler.arun
async def capped_arun(self, *a, **kw):
# Blocks here when MAX_PAGES crawls are already in flight; the slot is
# released automatically when arun finishes (async with).
async with GLOBAL_SEM:
return await orig_arun(self, *a, **kw)
AsyncWebCrawler.arun = capped_arun
# ───────────────────── FastAPI lifespan ──────────────────────
# NOTE(review): legacy CrawlerManager-based lifespan; superseded by the
# pool-based lifespan defined immediately after it in this rendering.
@asynccontextmanager
async def lifespan(app: FastAPI):
# Startup
logger.info("Starting up the server...")
if manager_config.enabled:
logger.info("Initializing Crawler Manager...")
await crawler_manager.initialize()
app.state.crawler_manager = crawler_manager # Store manager in app state
logger.info("Crawler Manager is enabled.")
else:
logger.warning("Crawler Manager is disabled.")
app.state.crawler_manager = None # Indicate disabled state
yield # Server runs here
# Shutdown
logger.info("Shutting down server...")
if app.state.crawler_manager:
logger.info("Shutting down Crawler Manager...")
await app.state.crawler_manager.shutdown()
logger.info("Crawler Manager shut down.")
logger.info("Server shut down.")
# Pool-based lifespan: warm up one browser at startup, run the idle-GC
# janitor while serving, then tear everything down.
# NOTE(review): no @asynccontextmanager decorator is visible on this def in
# this view, and the body references the module-level `app` rather than its
# `_` parameter — confirm both against the full file.
async def lifespan(_: FastAPI):
# Pre-launch a browser with the server-default config so the first
# request does not pay browser startup cost.
await get_crawler(BrowserConfig(
extra_args=config["crawler"]["browser"].get("extra_args", []),
**config["crawler"]["browser"].get("kwargs", {}),
)) # warmup
app.state.janitor = asyncio.create_task(janitor()) # idle GC
yield
app.state.janitor.cancel()
await close_all()
# ───────────────────── FastAPI instance ──────────────────────
app = FastAPI(
title=config["app"]["title"],
version=config["app"]["version"],
lifespan=lifespan,
)
# Configure middleware
def setup_security_middleware(app, config):
    """Attach HTTPS-redirect / trusted-host middleware per the security config.

    Does nothing unless config["security"]["enabled"] is truthy.
    """
    sec = config.get("security", {})
    if not sec.get("enabled", False):
        return
    if sec.get("https_redirect", False):
        app.add_middleware(HTTPSRedirectMiddleware)
    if sec.get("trusted_hosts", []) != ["*"]:
        app.add_middleware(TrustedHostMiddleware,
                           allowed_hosts=sec["trusted_hosts"])
# ── static playground ──────────────────────────────────────
STATIC_DIR = pathlib.Path(__file__).parent / "static" / "playground"
if not STATIC_DIR.exists():
raise RuntimeError(f"Playground assets not found at {STATIC_DIR}")
app.mount(
"/playground",
StaticFiles(directory=STATIC_DIR, html=True),
name="play",
)
# Optional nicetohave: opening the root shows the playground
@app.get("/")
async def root():
return RedirectResponse("/playground")
setup_security_middleware(app, config)
# ─────────────────── infra / middleware ─────────────────────
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
limiter = Limiter(
key_func=get_remote_address,
default_limits=[config["rate_limiting"]["default_limit"]],
storage_uri=config["rate_limiting"]["storage_uri"],
)
def _setup_security(app_: FastAPI):
    """Wire optional security middleware onto *app_* from the global `config`."""
    settings = config["security"]
    if not settings["enabled"]:
        # Security layer switched off: nothing to install.
        return
    if settings.get("https_redirect"):
        app_.add_middleware(HTTPSRedirectMiddleware)
    hosts = settings.get("trusted_hosts", [])
    if hosts != ["*"]:
        # Wildcard means "accept any host"; anything else gets enforced.
        app_.add_middleware(TrustedHostMiddleware, allowed_hosts=settings["trusted_hosts"])
_setup_security(app)
# Prometheus instrumentation
if config["observability"]["prometheus"]["enabled"]:
Instrumentator().instrument(app).expose(app)
# Get token dependency based on config
token_dependency = get_token_dependency(config)
# Middleware for security headers
token_dep = get_token_dependency(config)
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
response = await call_next(request)
resp = await call_next(request)
if config["security"]["enabled"]:
response.headers.update(config["security"]["headers"])
return response
resp.headers.update(config["security"]["headers"])
return resp
# ───────────────── safe configdump helper ─────────────────
ALLOWED_TYPES = {
"CrawlerRunConfig": CrawlerRunConfig,
"BrowserConfig": BrowserConfig,
}
def _safe_eval_config(expr: str) -> dict:
"""
Accept exactly one toplevel call to CrawlerRunConfig(...) or BrowserConfig(...).
Whatever is inside the parentheses is fine *except* further function calls
(so no __import__('os') stuff). All public names from crawl4ai are available
when we eval.

Returns the `.dump()` dict of the constructed config object.
Raises ValueError for any expression outside that shape.
"""
# Parse first: a SyntaxError propagates to the caller before any eval.
tree = ast.parse(expr, mode="eval")
# must be a single call
if not isinstance(tree.body, ast.Call):
raise ValueError("Expression must be a single constructor call")
call = tree.body
if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
# forbid nested calls to keep the surface tiny
for node in ast.walk(call):
if isinstance(node, ast.Call) and node is not call:
raise ValueError("Nested function calls are not permitted")
# expose everything that crawl4ai exports, nothing else
safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")}
# SECURITY: eval on user-supplied text. Builtins are emptied and nested
# calls are rejected above, which blocks direct code execution; keep this
# endpoint behind auth and revisit if the allowed surface ever grows.
obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
return obj.dump()
# NOTE(review): FastAPI dependency from the removed CrawlerManager design;
# the pool-based endpoints below no longer use it.
async def get_manager() -> CrawlerManager:
"""Dependency: return the app-state CrawlerManager or raise 503."""
# Ensure manager exists and is enabled before yielding
if not hasattr(app.state, 'crawler_manager') or app.state.crawler_manager is None:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Crawler service is disabled or not initialized"
)
if not app.state.crawler_manager.is_enabled():
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Crawler service is currently disabled"
)
return app.state.crawler_manager
# Token endpoint (always available, but usage depends on config)
# ───────────────────────── Schemas ───────────────────────────
# NOTE(review): duplicate of the CrawlRequest model defined earlier in this
# rendering (diff artifact); the later definition wins at import time.
class CrawlRequest(BaseModel):
"""Request body for /crawl and /crawl/stream."""
# 1-100 URLs per request (pydantic-enforced).
urls: List[str] = Field(min_length=1, max_length=100)
# Optional serialized BrowserConfig overrides.
browser_config: Optional[Dict] = Field(default_factory=dict)
# Optional serialized CrawlerRunConfig overrides.
crawler_config: Optional[Dict] = Field(default_factory=dict)
class RawCode(BaseModel):
"""Request body for /config/dump: a single constructor expression as text."""
# Raw Python source, validated by _safe_eval_config before evaluation.
code: str
# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
async def get_token(request_data: TokenRequest):
if not verify_email_domain(request_data.email):
raise HTTPException(status_code=400, detail="Invalid email domain")
token = create_access_token({"sub": request_data.email})
return {"email": request_data.email, "access_token": token, "token_type": "bearer"}
async def get_token(req: TokenRequest):
"""Issue a bearer JWT for an email on an allowed domain (400 otherwise)."""
if not verify_email_domain(req.email):
raise HTTPException(400, "Invalid email domain")
# Subject claim carries the requesting email.
token = create_access_token({"sub": req.email})
return {"email": req.email, "access_token": token, "token_type": "bearer"}
# Endpoints with conditional auth
@app.post("/config/dump")
async def config_dump(raw: RawCode):
try:
return JSONResponse(_safe_eval_config(raw.code.strip()))
except Exception as e:
raise HTTPException(400, str(e))
@app.get("/md/{url:path}")
@@ -171,230 +204,83 @@ async def get_markdown(
url: str,
f: FilterType = FilterType.FIT,
q: Optional[str] = None,
c: Optional[str] = "0",
token_data: Optional[Dict] = Depends(token_dependency)
c: str = "0",
_td: Dict = Depends(token_dep),
):
result = await handle_markdown_request(url, f, q, c, config)
return PlainTextResponse(result)
md = await handle_markdown_request(url, f, q, c, config)
return PlainTextResponse(md)
@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
@app.get("/llm/{url:path}")
async def llm_endpoint(
request: Request,
url: str = Path(...),
q: Optional[str] = Query(None),
token_data: Optional[Dict] = Depends(token_dependency)
_td: Dict = Depends(token_dep),
):
if not q:
raise HTTPException(
status_code=400, detail="Query parameter 'q' is required")
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
try:
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
raise HTTPException(400, "Query parameter 'q' is required")
if not url.startswith(("http://", "https://")):
url = "https://" + url
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
@app.get("/schema")
async def get_schema():
from crawl4ai import BrowserConfig, CrawlerRunConfig
return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
return {"browser": BrowserConfig().dump(),
"crawler": CrawlerRunConfig().dump()}
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
return {"status": "ok", "timestamp": time.time(), "version": __version__}
@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])
@app.get("/browswers")
# Optional dependency
async def health(manager: Optional[CrawlerManager] = Depends(get_manager, use_cache=False)):
base_status = {"status": "ok", "timestamp": time.time(),
"version": __version__}
if manager:
try:
manager_status = await manager.get_status()
base_status["crawler_manager"] = manager_status
except Exception as e:
base_status["crawler_manager"] = {
"status": "error", "detail": str(e)}
else:
base_status["crawler_manager"] = {"status": "disabled"}
return base_status
return RedirectResponse(config["observability"]["prometheus"]["endpoint"])
@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl(
request: Request,
crawl_request: CrawlRequest,
manager: CrawlerManager = Depends(get_manager), # Use dependency
token_data: Optional[Dict] = Depends(token_dependency) # Keep auth
_td: Dict = Depends(token_dep),
):
if not crawl_request.urls:
raise HTTPException(
status_code=400, detail="At least one URL required")
try:
# Use the manager's context to get a crawler instance
async with manager.get_crawler() as active_crawler:
# Call the actual handler from api.py, passing the acquired crawler
results_dict = await handle_crawl_request(
crawler=active_crawler, # Pass the live crawler instance
urls=crawl_request.urls,
# Pass user-provided configs, these might override pool defaults if needed
# Or the manager/handler could decide how to merge them
browser_config=crawl_request.browser_config or {}, # Ensure dict
crawler_config=crawl_request.crawler_config or {}, # Ensure dict
config=config # Pass the global server config
)
return JSONResponse(results_dict)
except PoolTimeoutError as e:
logger.warning(f"Request rejected due to pool timeout: {e}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, # Or 429
detail=f"Crawler resources busy. Please try again later. Timeout: {e}"
)
except NoHealthyCrawlerError as e:
logger.error(f"Request failed as no healthy crawler available: {e}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=f"Crawler service temporarily unavailable: {e}"
)
except HTTPException: # Re-raise HTTP exceptions from handler
raise
except Exception as e:
logger.error(
f"Unexpected error during batch crawl processing: {e}", exc_info=True)
# Return generic error, details might be logged by handle_crawl_request
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"An unexpected error occurred: {e}"
)
raise HTTPException(400, "At least one URL required")
res = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
)
return JSONResponse(res)
@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
request: Request,
crawl_request: CrawlRequest,
manager: CrawlerManager = Depends(get_manager),
token_data: Optional[Dict] = Depends(token_dependency)
_td: Dict = Depends(token_dep),
):
if not crawl_request.urls:
raise HTTPException(
status_code=400, detail="At least one URL required")
try:
# THIS IS A BIT WORK OF ART RATHER THAN ENGINEERING
# Acquire the crawler context from the manager
# IMPORTANT: The context needs to be active for the *duration* of the stream
# This structure might be tricky with FastAPI's StreamingResponse which consumes
# the generator *after* the endpoint function returns.
# --- Option A: Acquire crawler, pass to handler, handler yields ---
# (Requires handler NOT to be async generator itself, but return one)
# async with manager.get_crawler() as active_crawler:
# # Handler returns the generator
# _, results_gen = await handle_stream_crawl_request(
# crawler=active_crawler,
# urls=crawl_request.urls,
# browser_config=crawl_request.browser_config or {},
# crawler_config=crawl_request.crawler_config or {},
# config=config
# )
# # PROBLEM: `active_crawler` context exits before StreamingResponse uses results_gen
# # This releases the semaphore too early.
# --- Option B: Pass manager to handler, handler uses context internally ---
# (Requires modifying handle_stream_crawl_request signature/logic)
# This seems cleaner. Let's assume api.py is adapted for this.
# We need a way for the generator yielded by stream_results to know when
# to release the semaphore.
# --- Option C: Create a wrapper generator that handles context ---
async def stream_wrapper(manager: CrawlerManager, crawl_request: CrawlRequest, config: dict) -> AsyncGenerator[bytes, None]:
    """Hold the crawler-pool context open for the whole life of the NDJSON stream.

    FastAPI's StreamingResponse consumes the generator *after* the endpoint
    function returns, so acquiring the crawler inside this wrapper (Option C
    above) is what keeps the pool slot/semaphore held until the client has
    finished reading. Errors are reported in-band as a final JSON line rather
    than raised, because response headers are already sent once streaming starts.
    """
    active_crawler = None
    try:
        async with manager.get_crawler() as acquired_crawler:
            active_crawler = acquired_crawler # Keep reference for cleanup
            # Call the handler which returns the raw result generator
            _crawler_ref, results_gen = await handle_stream_crawl_request(
                crawler=acquired_crawler,
                urls=crawl_request.urls,
                browser_config=crawl_request.browser_config or {},
                crawler_config=crawl_request.crawler_config or {},
                config=config
            )
            # Use the stream_results utility to format and yield
            async for data_bytes in stream_results(_crawler_ref, results_gen):
                yield data_bytes
    except (PoolTimeoutError, NoHealthyCrawlerError) as e:
        # Pool exhausted / no healthy browser: yield a final error message in the stream
        error_payload = {"status": "error", "detail": str(e)}
        yield (json.dumps(error_payload) + "\n").encode('utf-8')
        logger.warning(f"Stream request failed: {e}")
        # Re-raise might be better if StreamingResponse handles it? Test needed.
    except HTTPException as e: # Catch HTTP exceptions from handler setup
        error_payload = {"status": "error",
                         "detail": e.detail, "status_code": e.status_code}
        yield (json.dumps(error_payload) + "\n").encode('utf-8')
        logger.warning(
            f"Stream request failed with HTTPException: {e.detail}")
    except Exception as e:
        # Catch-all: surface the failure to the client instead of dropping the stream
        error_payload = {"status": "error",
                        "detail": f"Unexpected stream error: {e}"}
        yield (json.dumps(error_payload) + "\n").encode('utf-8')
        logger.error(
            f"Unexpected error during stream processing: {e}", exc_info=True)
    # finally:
    # Ensure crawler cleanup if stream_results doesn't handle it?
    # stream_results *should* call crawler.close(), but only on the
    # instance it received. If we pass the *manager* instead, this gets complex.
    # Let's stick to passing the acquired_crawler and rely on stream_results.
# Create the generator using the wrapper
streaming_generator = stream_wrapper(manager, crawl_request, config)
return StreamingResponse(
streaming_generator, # Use the wrapper
media_type='application/x-ndjson',
headers={'Cache-Control': 'no-cache',
'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
)
except (PoolTimeoutError, NoHealthyCrawlerError) as e:
# These might occur if get_crawler fails *before* stream starts
# Or if the wrapper re-raises them.
logger.warning(f"Stream request rejected before starting: {e}")
status_code = status.HTTP_503_SERVICE_UNAVAILABLE # Or 429 for timeout
# Don't raise HTTPException here, let the wrapper yield the error message.
# If we want to return a non-200 initial status, need more complex handling.
# Return an *empty* stream with error headers? Or just let wrapper yield error.
async def _error_stream():
error_payload = {"status": "error", "detail": str(e)}
yield (json.dumps(error_payload) + "\n").encode('utf-8')
return StreamingResponse(_error_stream(), status_code=status_code, media_type='application/x-ndjson')
except HTTPException: # Re-raise HTTP exceptions from setup
raise
except Exception as e:
logger.error(
f"Unexpected error setting up stream crawl: {e}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"An unexpected error occurred setting up the stream: {e}"
)
raise HTTPException(400, "At least one URL required")
crawler, gen = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
)
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Stream-Status": "active",
},
)
# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
import uvicorn
uvicorn.run(
@@ -402,5 +288,6 @@ if __name__ == "__main__":
host=config["app"]["host"],
port=config["app"]["port"],
reload=config["app"]["reload"],
timeout_keep_alive=config["app"]["timeout_keep_alive"]
timeout_keep_alive=config["app"]["timeout_keep_alive"],
)
# ─────────────────────────────────────────────────────────────

View File

@@ -0,0 +1,813 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Crawl4AI Playground</title>
<script src="https://cdn.tailwindcss.com"></script>
<script>
tailwind.config = {
theme: {
extend: {
colors: {
primary: '#4EFFFF',
primarydim: '#09b5a5',
accent: '#F380F5',
dark: '#070708',
light: '#E8E9ED',
secondary: '#D5CEBF',
codebg: '#1E1E1E',
surface: '#202020',
border: '#3F3F44',
},
fontFamily: {
mono: ['Fira Code', 'monospace'],
},
}
}
}
</script>
<link href="https://fonts.googleapis.com/css2?family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
<!-- Highlight.js -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github-dark.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.11/clipboard.min.js"></script>
<!-- CodeMirror (python mode) -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/codemirror.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/codemirror.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/mode/python/python.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/addon/edit/matchbrackets.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/addon/selection/active-line.min.js"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.65.16/theme/darcula.min.css">
<!-- <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/bash.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/json.min.js"></script> -->
<style>
/* Custom CodeMirror styling to match theme */
.CodeMirror {
background-color: #1E1E1E !important;
color: #E8E9ED !important;
border-radius: 4px;
font-family: 'Fira Code', monospace;
font-size: 0.9rem;
}
.CodeMirror-gutters {
background-color: #1E1E1E !important;
border-right: 1px solid #3F3F44 !important;
}
.CodeMirror-linenumber {
color: #3F3F44 !important;
}
.cm-s-darcula .cm-keyword {
color: #4EFFFF !important;
}
.cm-s-darcula .cm-string {
color: #F380F5 !important;
}
.cm-s-darcula .cm-number {
color: #D5CEBF !important;
}
/* Add to your <style> section or Tailwind config */
.hljs {
background: #1E1E1E !important;
border-radius: 4px;
padding: 1rem !important;
}
pre code.hljs {
display: block;
overflow-x: auto;
}
/* Language-specific colors */
.hljs-attr {
color: #4EFFFF;
}
/* JSON keys */
.hljs-string {
color: #F380F5;
}
/* Strings */
.hljs-number {
color: #D5CEBF;
}
/* Numbers */
.hljs-keyword {
color: #4EFFFF;
}
pre code {
white-space: pre-wrap;
word-break: break-word;
}
.copy-btn {
transition: all 0.2s ease;
opacity: 0.7;
}
.copy-btn:hover {
opacity: 1;
}
.tab-content:hover .copy-btn {
opacity: 0.7;
}
.tab-content:hover .copy-btn:hover {
opacity: 1;
}
/* copied text highlighted */
.highlighted {
background-color: rgba(78, 255, 255, 0.2) !important;
transition: background-color 0.5s ease;
}
</style>
</head>
<body class="bg-dark text-light font-mono min-h-screen flex flex-col" style="font-feature-settings: 'calt' 0;">
<!-- Header -->
<header class="border-b border-border px-4 py-2 flex items-center">
<h1 class="text-lg font-medium flex items-center space-x-4">
<span>🚀🤖 <span class="text-primary">Crawl4AI</span> Playground</span>
<!-- GitHub badges -->
<a href="https://github.com/unclecode/crawl4ai" target="_blank" class="flex space-x-1">
<img src="https://img.shields.io/github/stars/unclecode/crawl4ai?style=social"
alt="GitHub stars" class="h-5">
<img src="https://img.shields.io/github/forks/unclecode/crawl4ai?style=social"
alt="GitHub forks" class="h-5">
</a>
<!-- Docs -->
<a href="https://docs.crawl4ai.com" target="_blank"
class="text-xs text-secondary hover:text-primary underline flex items-center">
Docs
</a>
<!-- X (Twitter) follow -->
<a href="https://x.com/unclecode" target="_blank"
class="hover:text-primary flex items-center" title="Follow @unclecode on X">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"
class="w-4 h-4 fill-current mr-1">
<path d="M22.46 6c-.77.35-1.6.58-2.46.69a4.27 4.27 0 001.88-2.35 8.53 8.53 0 01-2.71 1.04 4.24 4.24 0 00-7.23 3.87A12.05 12.05 0 013 4.62a4.24 4.24 0 001.31 5.65 4.2 4.2 0 01-1.92-.53v.05a4.24 4.24 0 003.4 4.16 4.31 4.31 0 01-1.91.07 4.25 4.25 0 003.96 2.95A8.5 8.5 0 012 19.55a12.04 12.04 0 006.53 1.92c7.84 0 12.13-6.49 12.13-12.13 0-.18-.01-.36-.02-.54A8.63 8.63 0 0024 5.1a8.45 8.45 0 01-2.54.7z"/>
</svg>
<span class="text-xs">@unclecode</span>
</a>
</h1>
<div class="ml-auto flex space-x-2">
<button id="play-tab"
class="px-3 py-1 rounded-t bg-surface border border-b-0 border-border text-primary">Playground</button>
<button id="stress-tab" class="px-3 py-1 rounded-t border border-border hover:bg-surface">Stress
Test</button>
</div>
</header>
<!-- Main Playground -->
<main id="playground" class="flex-1 flex flex-col p-4 space-y-4 max-w-5xl w-full mx-auto">
<!-- Request Builder -->
<section class="bg-surface rounded-lg border border-border overflow-hidden">
<div class="px-4 py-2 border-b border-border flex items-center">
<h2 class="font-medium">Request Builder</h2>
<select id="endpoint" class="ml-auto bg-dark border border-border rounded px-2 py-1 text-sm">
<option value="crawl">/crawl (batch)</option>
<option value="crawl_stream">/crawl/stream</option>
<option value="md">/md</option>
<option value="llm">/llm</option>
</select>
</div>
<div class="p-4">
<label class="block mb-2 text-sm">URL(s) - one per line</label>
<textarea id="urls" class="w-full bg-dark border border-border rounded p-2 h-32 text-sm mb-4"
spellcheck="false">https://example.com</textarea>
<details class="mb-4">
<summary class="text-sm text-secondary cursor-pointer">Advanced Config <span
class="text-xs text-primary">(Python → autoJSON)</span></summary>
<!-- Toolbar -->
<div class="flex items-center justify-end space-x-3 mt-2">
<label for="cfg-type" class="text-xs text-secondary">Type:</label>
<select id="cfg-type"
class="bg-dark border border-border rounded px-1 py-0.5 text-xs">
<option value="CrawlerRunConfig">CrawlerRunConfig</option>
<option value="BrowserConfig">BrowserConfig</option>
</select>
<!-- help link -->
<a href="https://docs.crawl4ai.com/api/parameters/"
target="_blank"
class="text-xs text-primary hover:underline flex items-center space-x-1"
title="Open parameter reference in new tab">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"
class="w-4 h-4 fill-current">
<path d="M13 3h8v8h-2V6.41l-9.29 9.3-1.42-1.42 9.3-9.29H13V3z"/>
<path d="M5 5h4V3H3v6h2V5zm0 14v-4H3v6h6v-2H5z"/>
</svg>
<span>Docs</span>
</a>
<span id="cfg-status" class="text-xs text-secondary ml-2"></span>
</div>
<!-- CodeMirror host -->
<div id="adv-editor" class="mt-2 border border-border rounded overflow-hidden h-40"></div>
</details>
<div class="flex space-x-2">
<button id="run-btn" class="bg-primary text-dark px-4 py-2 rounded hover:bg-primarydim font-medium">
Run (⌘/Ctrl+Enter)
</button>
<button id="export-btn" class="border border-border px-4 py-2 rounded hover:bg-surface hidden">
Export Python Code
</button>
</div>
</div>
</section>
<!-- Execution Status -->
<section id="execution-status" class="hidden bg-surface rounded-lg border border-border p-3 text-sm">
<div class="flex space-x-4">
<div id="status-badge" class="flex items-center">
<span class="w-3 h-3 rounded-full mr-2"></span>
<span>Ready</span>
</div>
<div>
<span class="text-secondary">Time:</span>
<span id="exec-time" class="text-light">-</span>
</div>
<div>
<span class="text-secondary">Memory:</span>
<span id="exec-mem" class="text-light">-</span>
</div>
</div>
</section>
<!-- Response Viewer -->
<!-- Update the Response Viewer section -->
<section class="bg-surface rounded-lg border border-border overflow-hidden flex-1 flex flex-col">
<div class="border-b border-border flex">
<button data-tab="response" class="tab-btn active px-4 py-2 border-r border-border">Response</button>
<button data-tab="python" class="tab-btn px-4 py-2 border-r border-border">Python</button>
<button data-tab="curl" class="tab-btn px-4 py-2">cURL</button>
</div>
<div class="flex-1 overflow-auto relative">
<!-- Response Tab -->
<div class="tab-content active h-full">
<div class="absolute right-2 top-2">
<button class="copy-btn bg-surface border border-border rounded px-2 py-1 text-xs hover:bg-dark"
data-target="#response-content code">
Copy
</button>
</div>
<pre id="response-content" class="p-4 text-sm h-full"><code class="json hljs">{}</code></pre>
</div>
<!-- Python Tab -->
<div class="tab-content hidden h-full">
<div class="absolute right-2 top-2">
<button class="copy-btn bg-surface border border-border rounded px-2 py-1 text-xs hover:bg-dark"
data-target="#python-content code">
Copy
</button>
</div>
<pre id="python-content" class="p-4 text-sm h-full"><code class="python hljs"></code></pre>
</div>
<!-- cURL Tab -->
<div class="tab-content hidden h-full">
<div class="absolute right-2 top-2">
<button class="copy-btn bg-surface border border-border rounded px-2 py-1 text-xs hover:bg-dark"
data-target="#curl-content code">
Copy
</button>
</div>
<pre id="curl-content" class="p-4 text-sm h-full"><code class="bash hljs"></code></pre>
</div>
</div>
</section>
</main>
<!-- Stress Test Modal -->
<div id="stress-modal"
class="hidden fixed inset-0 bg-black bg-opacity-70 z-50 flex items-center justify-center p-4">
<div class="bg-surface rounded-lg border border-accent w-full max-w-3xl max-h-[90vh] flex flex-col">
<div class="px-4 py-2 border-b border-border flex items-center">
<h2 class="font-medium text-accent">🔥 Stress Test</h2>
<button id="close-stress" class="ml-auto text-secondary hover:text-light">&times;</button>
</div>
<div class="p-4 space-y-4 flex-1 overflow-auto">
<div class="grid grid-cols-3 gap-4">
<div>
<label class="block text-sm mb-1">Total URLs</label>
<input id="st-total" type="number" value="20"
class="w-full bg-dark border border-border rounded px-3 py-1">
</div>
<div>
<label class="block text-sm mb-1">Chunk Size</label>
<input id="st-chunk" type="number" value="5"
class="w-full bg-dark border border-border rounded px-3 py-1">
</div>
<div>
<label class="block text-sm mb-1">Concurrency</label>
<input id="st-conc" type="number" value="2"
class="w-full bg-dark border border-border rounded px-3 py-1">
</div>
</div>
<div class="flex items-center">
<input id="st-stream" type="checkbox" class="mr-2">
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
<button id="st-run"
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
Run Stress Test
</button>
</div>
<div class="mt-4">
<div class="bg-dark rounded border border-border p-3 h-64 overflow-auto text-sm whitespace-break-spaces"
id="stress-log"></div>
</div>
</div>
<div class="px-4 py-2 border-t border-border text-sm text-secondary">
<div class="flex justify-between">
<span>Completed: <span id="stress-completed">0</span>/<span id="stress-total">0</span></span>
<span>Avg. Time: <span id="stress-avg-time">0</span>ms</span>
<span>Peak Memory: <span id="stress-peak-mem">0</span>MB</span>
</div>
</div>
</div>
</div>
<script>
// ── Tab switching: Response / Python / cURL panes inside the response viewer ──
document.querySelectorAll('.tab-btn').forEach(btn => {
    btn.addEventListener('click', () => {
        document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
        document.querySelectorAll('.tab-content').forEach(c => c.classList.add('hidden'));
        btn.classList.add('active');
        const tabName = btn.dataset.tab;
        // The <pre id="{tab}-content"> sits inside the .tab-content wrapper div.
        document.querySelector(`#${tabName}-content`).parentElement.classList.remove('hidden');
        // Re-highlight content when switching tabs
        const activeCode = document.querySelector(`#${tabName}-content code`);
        if (activeCode) {
            forceHighlightElement(activeCode);
        }
    });
});
// ── View switching: Playground main view vs. Stress-test modal ──
document.getElementById('play-tab').addEventListener('click', () => {
    document.getElementById('playground').classList.remove('hidden');
    document.getElementById('stress-modal').classList.add('hidden');
    document.getElementById('play-tab').classList.add('bg-surface', 'border-b-0');
    document.getElementById('stress-tab').classList.remove('bg-surface', 'border-b-0');
});
document.getElementById('stress-tab').addEventListener('click', () => {
    document.getElementById('stress-modal').classList.remove('hidden');
    document.getElementById('stress-tab').classList.add('bg-surface', 'border-b-0');
    document.getElementById('play-tab').classList.remove('bg-surface', 'border-b-0');
});
document.getElementById('close-stress').addEventListener('click', () => {
    document.getElementById('stress-modal').classList.add('hidden');
    document.getElementById('play-tab').classList.add('bg-surface', 'border-b-0');
    document.getElementById('stress-tab').classList.remove('bg-surface', 'border-b-0');
});
// Initialize clipboard (export button) and highlight.js for static code blocks
new ClipboardJS('#export-btn');
hljs.highlightAll();
// Keyboard shortcut: Cmd/Ctrl+Enter triggers a run
window.addEventListener('keydown', e => {
    if ((e.ctrlKey || e.metaKey) && e.key === 'Enter') {
        document.getElementById('run-btn').click();
    }
});
// ================ ADVANCED CONFIG EDITOR ================
// CodeMirror instance hosting the Python config snippet (python mode, darcula
// theme). Its raw text is POSTed verbatim to /config/dump for server-side parsing.
const cm = CodeMirror(document.getElementById('adv-editor'), {
    value: `CrawlerRunConfig(
stream=True,
cache_mode=CacheMode.BYPASS,
)`,
    mode: 'python',
    lineNumbers: true,
    theme: 'darcula',
    tabSize: 4,
    styleActiveLine: true,
    matchBrackets: true,
    gutters: ["CodeMirror-linenumbers"],
    lineWrapping: true,
});
// Starter snippet per config type; selecting a type replaces the editor content.
const TEMPLATES = {
    CrawlerRunConfig: `CrawlerRunConfig(
stream=True,
cache_mode=CacheMode.BYPASS,
)`,
    BrowserConfig: `BrowserConfig(
headless=True,
extra_args=[
"--no-sandbox",
"--disable-gpu",
],
)`,
};
document.getElementById('cfg-type').addEventListener('change', (e) => {
    cm.setValue(TEMPLATES[e.target.value]);
    // Clear any stale "parsed"/"error" badge from the previous snippet.
    document.getElementById('cfg-status').textContent = '';
});
// Convert the Python config snippet in the editor to JSON by POSTing it to
// the server's /config/dump endpoint. Updates the inline status badge and
// throws on an invalid config so the caller can abort the run.
async function pyConfigToJson() {
    const code = cm.getValue().trim();
    if (!code) return {};
    const response = await fetch('/config/dump', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ code }),
    });
    const statusEl = document.getElementById('cfg-status');
    if (!response.ok) {
        const detail = await response.text();
        statusEl.textContent = '✖ config error';
        statusEl.className = 'text-xs text-red-400';
        throw new Error(detail || 'Invalid config');
    }
    statusEl.textContent = '✓ parsed';
    statusEl.className = 'text-xs text-green-400';
    return response.json();
}
// ================ SERVER COMMUNICATION ================
// Update status UI
// Reveal the execution-status bar and paint its badge, timing and memory
// fields. `status` is 'success' | 'error' | anything else (shown as
// "Processing..."). Time is in ms; memory/peakMemory in MB (both optional).
function updateStatus(status, time, memory, peakMemory) {
    const panel = document.getElementById('execution-status');
    const badge = document.querySelector('#status-badge span:first-child');
    const label = document.querySelector('#status-badge span:last-child');
    panel.classList.remove('hidden');
    badge.className = 'w-3 h-3 rounded-full mr-2'; // reset previous color
    switch (status) {
        case 'success':
            badge.classList.add('bg-green-500');
            label.textContent = 'Success';
            break;
        case 'error':
            badge.classList.add('bg-red-500');
            label.textContent = 'Error';
            break;
        default:
            badge.classList.add('bg-yellow-500');
            label.textContent = 'Processing...';
    }
    if (time) {
        document.getElementById('exec-time').textContent = `${time}ms`;
    }
    if (memory !== undefined && peakMemory !== undefined) {
        document.getElementById('exec-mem').textContent = `Δ${memory >= 0 ? '+' : ''}${memory}MB (Peak: ${peakMemory}MB)`;
    }
}
// Generate code snippets
function generateSnippets(api, payload) {
// Python snippet
const pyCodeEl = document.querySelector('#python-content code');
const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`;
pyCodeEl.textContent = pySnippet;
pyCodeEl.className = 'python hljs'; // Reset classes
forceHighlightElement(pyCodeEl);
// cURL snippet
const curlCodeEl = document.querySelector('#curl-content code');
const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`;
curlCodeEl.textContent = curlSnippet;
curlCodeEl.className = 'bash hljs'; // Reset classes
forceHighlightElement(curlCodeEl);
}
// Main run function
async function runCrawl() {
const endpoint = document.getElementById('endpoint').value;
const urls = document.getElementById('urls').value.trim().split(/\n/).filter(u => u);
// 1) grab python from CodeMirror, validate via /config/dump
let advConfig = {};
try {
const cfgJson = await pyConfigToJson(); // may throw
if (Object.keys(cfgJson).length) {
const cfgType = document.getElementById('cfg-type').value;
advConfig = cfgType === 'CrawlerRunConfig'
? { crawler_config: cfgJson }
: { browser_config: cfgJson };
}
} catch (err) {
updateStatus('error');
document.querySelector('#response-content code').textContent =
JSON.stringify({ error: err.message }, null, 2);
forceHighlightElement(document.querySelector('#response-content code'));
return; // stop run
}
const endpointMap = {
crawl: '/crawl',
crawl_stream: '/crawl/stream',
md: '/md',
llm: '/llm'
};
const api = endpointMap[endpoint];
const payload = {
urls,
...advConfig
};
updateStatus('processing');
try {
const startTime = performance.now();
let response, responseData;
if (endpoint === 'crawl_stream') {
// Stream processing
response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
const reader = response.body.getReader();
let text = '';
let maxMemory = 0;
while (true) {
const { value, done } = await reader.read();
if (done) break;
const chunk = new TextDecoder().decode(value);
text += chunk;
// Process each line for memory updates
chunk.trim().split('\n').forEach(line => {
if (!line) return;
try {
const obj = JSON.parse(line);
if (obj.server_memory_mb) {
maxMemory = Math.max(maxMemory, obj.server_memory_mb);
}
} catch (e) {
console.error('Error parsing stream line:', e);
}
});
}
responseData = { stream: text };
const time = Math.round(performance.now() - startTime);
updateStatus('success', time, null, maxMemory);
document.querySelector('#response-content code').textContent = text;
document.querySelector('#response-content code').className = 'json hljs'; // Reset classes
forceHighlightElement(document.querySelector('#response-content code'));
} else {
// Regular request
response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
responseData = await response.json();
const time = Math.round(performance.now() - startTime);
if (!response.ok) {
updateStatus('error', time);
throw new Error(responseData.error || 'Request failed');
}
updateStatus(
'success',
time,
responseData.server_memory_delta_mb,
responseData.server_peak_memory_mb
);
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
document.querySelector('#response-content code').className = 'json hljs'; // Ensure class is set
forceHighlightElement(document.querySelector('#response-content code'));
}
forceHighlightElement(document.querySelector('#response-content code'));
generateSnippets(api, payload);
} catch (error) {
console.error('Error:', error);
updateStatus('error');
document.querySelector('#response-content code').textContent = JSON.stringify(
{ error: error.message },
null,
2
);
forceHighlightElement(document.querySelector('#response-content code'));
}
}
// Stress test function
async function runStressTest() {
const total = parseInt(document.getElementById('st-total').value);
const chunkSize = parseInt(document.getElementById('st-chunk').value);
const concurrency = parseInt(document.getElementById('st-conc').value);
const useStream = document.getElementById('st-stream').checked;
const logEl = document.getElementById('stress-log');
logEl.textContent = '';
document.getElementById('stress-completed').textContent = '0';
document.getElementById('stress-total').textContent = total;
document.getElementById('stress-avg-time').textContent = '0';
document.getElementById('stress-peak-mem').textContent = '0';
const api = useStream ? '/crawl/stream' : '/crawl';
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
const chunks = [];
for (let i = 0; i < urls.length; i += chunkSize) {
chunks.push(urls.slice(i, i + chunkSize));
}
let completed = 0;
let totalTime = 0;
let peakMemory = 0;
const processBatch = async (batch, index) => {
const payload = {
urls: batch,
browser_config: {},
crawler_config: { cache_mode: 'BYPASS', stream: useStream }
};
const start = performance.now();
let time, memory;
try {
if (useStream) {
const response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
const reader = response.body.getReader();
let maxMem = 0;
while (true) {
const { value, done } = await reader.read();
if (done) break;
const text = new TextDecoder().decode(value);
text.split('\n').forEach(line => {
try {
const obj = JSON.parse(line);
if (obj.server_memory_mb) {
maxMem = Math.max(maxMem, obj.server_memory_mb);
}
} catch { }
});
}
memory = maxMem;
} else {
const response = await fetch(api, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
const data = await response.json();
memory = data.server_peak_memory_mb;
}
time = Math.round(performance.now() - start);
peakMemory = Math.max(peakMemory, memory || 0);
totalTime += time;
logEl.textContent += `[${index + 1}/${chunks.length}] ✔ ${time}ms | Peak ${memory}MB\n`;
} catch (error) {
time = Math.round(performance.now() - start);
logEl.textContent += `[${index + 1}/${chunks.length}] ✖ ${time}ms | ${error.message}\n`;
}
completed += batch.length;
document.getElementById('stress-completed').textContent = completed;
document.getElementById('stress-peak-mem').textContent = peakMemory;
document.getElementById('stress-avg-time').textContent = Math.round(totalTime / (index + 1));
logEl.scrollTop = logEl.scrollHeight;
};
// Run with concurrency control
let active = 0;
let index = 0;
return new Promise(resolve => {
const runNext = () => {
while (active < concurrency && index < chunks.length) {
processBatch(chunks[index], index)
.finally(() => {
active--;
runNext();
});
active++;
index++;
}
if (active === 0 && index >= chunks.length) {
logEl.textContent += '\n✅ Stress test completed\n';
resolve();
}
};
runNext();
});
}
// Event listeners
document.getElementById('run-btn').addEventListener('click', runCrawl);
document.getElementById('st-run').addEventListener('click', runStressTest);
// Re-run highlight.js on a code element whose text content changed.
// hljs.highlightElement skips nodes already marked as highlighted, so we
// reset the node content and the data-highlighted marker first.
//
// Fix: the previous version reset via `element.innerHTML = text`, which
// re-parses the displayed code as HTML — any '<', '&' or embedded markup in
// a response/snippet would be interpreted (content mangling / XSS hazard).
// Assigning textContent keeps the code inert.
function forceHighlightElement(element) {
    if (!element) return;
    // Save current scroll position (important for large code blocks)
    const scrollTop = element.parentElement.scrollTop;
    // Reset the element safely (strips previous highlight spans)
    element.textContent = element.textContent;
    element.removeAttribute('data-highlighted');
    // Reapply highlighting
    hljs.highlightElement(element);
    // Restore scroll position
    element.parentElement.scrollTop = scrollTop;
}
// Initialize clipboard for all copy buttons
function initCopyButtons() {
document.querySelectorAll('.copy-btn').forEach(btn => {
new ClipboardJS(btn, {
text: () => {
const target = document.querySelector(btn.dataset.target);
return target ? target.textContent : '';
}
}).on('success', e => {
e.clearSelection();
// make button text "copied" for 1 second
const originalText = e.trigger.textContent;
e.trigger.textContent = 'Copied!';
setTimeout(() => {
e.trigger.textContent = originalText;
}, 1000);
// Highlight the copied code
const target = document.querySelector(btn.dataset.target);
if (target) {
target.classList.add('highlighted');
setTimeout(() => {
target.classList.remove('highlighted');
}, 1000);
}
}).on('error', e => {
console.error('Error copying:', e);
});
});
}
// Call this in your DOMContentLoaded or initialization
initCopyButtons();
</script>
</body>
</html>

34
tests/memory/cap_test.py Normal file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
"""
import asyncio, httpx, json, uuid, argparse
API = "http://localhost:8020/crawl"
URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
CONCURRENT_CALLS = 20 # way above your cap
payload_template = {
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "verbose": False},
}
}
async def one_call(client):
    """POST one /crawl request with a unique URL and return the server's
    reported peak memory (MB)."""
    payload = {
        **payload_template,
        "urls": [f"https://httpbin.org/anything/{uuid.uuid4()}"],
    }
    resp = await client.post(API, json=payload)
    resp.raise_for_status()
    return resp.json()["server_peak_memory_mb"]
async def main():
    """Fire CONCURRENT_CALLS requests at once and print the per-call peak-memory
    figures the server reported (proves the global semaphore caps concurrency)."""
    async with httpx.AsyncClient(timeout=60) as client:
        peaks = await asyncio.gather(
            *(one_call(client) for _ in range(CONCURRENT_CALLS))
        )
        print("Calls finished OK, server peaks reported:", peaks)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""
Quick sanity-check for the /config/dump endpoint.
Usage:
python test_config_dump.py [http://localhost:8020]
If the server isn't running, start it first:
uvicorn deploy.docker.server:app --port 8020
"""
import sys, json, textwrap, requests
BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
URL = f"{BASE.rstrip('/')}/config/dump"
CASES = [
# --- CrawlRunConfig variants ---
"CrawlerRunConfig()",
"CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
"CrawlerRunConfig(js_only=True, wait_until='networkidle')",
# --- BrowserConfig variants ---
"BrowserConfig()",
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
]
# POST each snippet to /config/dump and show a truncated pretty-printed
# result, or the HTTP error on failure.
for code in CASES:
    print("\n=== POST:", code)
    response = requests.post(URL, json={"code": code}, timeout=15)
    if not response.ok:
        print("ERROR", response.status_code, response.text[:200])
        continue
    print(json.dumps(response.json(), indent=2)[:400] + "...")

View File

@@ -24,13 +24,13 @@ from rich.panel import Panel
from rich.syntax import Syntax
# --- Constants ---
# DEFAULT_API_URL = "http://localhost:11235" # Default port
DEFAULT_API_URL = "http://localhost:11235" # Default port
DEFAULT_API_URL = "http://localhost:8020" # Default port
DEFAULT_URL_COUNT = 1000
DEFAULT_MAX_CONCURRENT_REQUESTS = 5
DEFAULT_URL_COUNT = 100
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
DEFAULT_CHUNK_SIZE = 10
DEFAULT_REPORT_PATH = "reports_api"
DEFAULT_STREAM_MODE = False
DEFAULT_STREAM_MODE = True
REQUEST_TIMEOUT = 180.0
# Initialize Rich console
@@ -77,6 +77,10 @@ class ApiStressTest:
self.report_path = pathlib.Path(report_path)
self.report_path.mkdir(parents=True, exist_ok=True)
self.stream_mode = stream_mode
# Ignore repo path and set it to current file path
self.repo_path = pathlib.Path(__file__).parent.resolve()
self.test_id = time.strftime("%Y%m%d_%H%M%S")
self.results_summary = {

View File

@@ -0,0 +1,203 @@
"""Lite Crawl4AI API stresstester.
✔ batch or stream mode (single unified path)
✔ global stats + JSON summary
✔ rich table progress
✔ Typer CLI with presets (quick / soak)
Usage examples:
python api_stress_test.py # uses quick preset
python api_stress_test.py soak # 5K URLs stress run
python api_stress_test.py --urls 200 --concurrent 10 --chunk 20
"""
from __future__ import annotations
import asyncio, json, time, uuid, pathlib, statistics
from typing import List, Dict, Optional
import httpx, typer
from rich.console import Console
from rich.table import Table
# ───────────────────────── defaults / presets ──────────────────────────
PRESETS = {
"quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
"debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
"soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
}
API_HEALTH_ENDPOINT = "/health"
REQUEST_TIMEOUT = 180.0
console = Console()
app = typer.Typer(add_completion=False, rich_markup_mode="rich")
# ───────────────────────── helpers ─────────────────────────────────────
async def _check_health(client: httpx.AsyncClient) -> None:
    """Fail fast (raise) unless the API's /health endpoint answers 2xx."""
    health = await client.get(API_HEALTH_ENDPOINT, timeout=10)
    health.raise_for_status()
    version = health.json().get('version', '?')
    console.print(f"[green]Server healthy — version {version}[/]")
async def _iter_results(resp: httpx.Response, stream: bool):
    """Yield per-URL result dicts from a batch JSON body or an NDJSON stream.

    Fix: the two branches previously yielded different shapes — a bare dict
    in stream mode but a ``(dict, payload)`` tuple in batch mode — which made
    the generator impossible to consume uniformly. Both branches now yield
    the plain per-URL result dict; batch-level memory stats remain available
    through the payload consumed by ``_consume_batch``.

    Args:
        resp: The HTTP response (open stream when ``stream`` is True).
        stream: Whether ``resp`` is an NDJSON stream or a batch JSON body.
    """
    if stream:
        async for line in resp.aiter_lines():
            if not line:
                continue  # skip blank keep-alive lines in the NDJSON stream
            rec = json.loads(line)
            if rec.get("status") == "completed":
                break  # server-side end-of-stream sentinel
            yield rec
    else:
        for rec in resp.json().get("results", []):
            yield rec
async def _consume_stream(resp: httpx.Response) -> Dict:
    """Tally an NDJSON /crawl/stream response into success/fail/memory stats."""
    success = failed = 0
    peak_mem = 0.0
    async for raw in resp.aiter_lines():
        if not raw:
            continue  # ignore blank keep-alive lines
        record = json.loads(raw)
        if record.get("status") == "completed":
            break  # end-of-stream sentinel emitted by the server
        if record.get("success"):
            success += 1
        else:
            failed += 1
        reported = record.get("server_memory_mb")
        if reported is not None:
            # Track the highest memory figure seen across the stream.
            peak_mem = max(peak_mem, float(reported))
    return {"success_urls": success, "failed_urls": failed, "mem_metric": peak_mem}
def _consume_batch(body: Dict) -> Dict:
stats = {"success_urls": 0, "failed_urls": 0}
for rec in body.get("results", []):
if rec.get("success"):
stats["success_urls"] += 1
else:
stats["failed_urls"] += 1
stats["mem_metric"] = body.get("server_memory_delta_mb")
stats["peak"] = body.get("server_peak_memory_mb")
return stats
async def _fetch_chunk(
    client: httpx.AsyncClient,
    urls: List[str],
    stream: bool,
    semaphore: asyncio.Semaphore,
) -> Dict:
    """POST one chunk of URLs and return its success/fail/memory/elapsed stats."""
    if stream:
        endpoint = "/crawl/stream"
    else:
        endpoint = "/crawl"
    payload = {
        "urls": urls,
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {"type": "CrawlerRunConfig",
                           "params": {"cache_mode": "BYPASS", "stream": stream}},
    }
    async with semaphore:  # bound the number of in-flight requests
        started = time.perf_counter()
        if stream:
            # Streaming: tally NDJSON records as they arrive.
            async with client.stream("POST", endpoint, json=payload) as resp:
                resp.raise_for_status()
                stats = await _consume_stream(resp)
        else:
            # Batch: one JSON body containing every result.
            resp = await client.post(endpoint, json=payload)
            resp.raise_for_status()
            stats = _consume_batch(resp.json())
        stats["elapsed"] = time.perf_counter() - started
        return stats
# ───────────────────────── core runner ─────────────────────────────────
async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
    """Drive the stress test: health-check, fan out chunked crawl requests,
    aggregate per-chunk stats, render a table and write a JSON summary.

    Args:
        api: Base URL of the running Crawl4AI server.
        urls: Total number of synthetic URLs to crawl.
        concurrent: Maximum simultaneous API requests.
        chunk: URLs per request.
        stream: Use NDJSON /crawl/stream instead of batch /crawl.
        report: Directory the JSON summary is written into.
    """
    client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
    try:
        await _check_health(client)
        url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
        chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
        sem = asyncio.Semaphore(concurrent)
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Batch", style="dim", width=6)
        table.add_column("Success/Fail", width=12)
        table.add_column("Mem", width=14)
        table.add_column("Time (s)")
        agg_success = agg_fail = 0
        deltas, peaks = [], []
        start = time.perf_counter()
        tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
        for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
            res = await coro
            agg_success += res["success_urls"]
            agg_fail += res["failed_urls"]
            # BUG FIX: stream-mode stats (from _consume_stream) carry no
            # "peak" key, so res["peak"] raised KeyError; use .get() so both
            # batch and stream results are handled.
            mem = res.get("mem_metric")
            peak = res.get("peak")
            if mem is not None:
                deltas.append(mem)
            if peak is not None:
                peaks.append(peak)
            mem_txt = f"{mem:.1f}" if mem is not None else ""
            if peak is not None:
                mem_txt = f"{peak:.1f}/{mem_txt}"
            table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
        console.print(table)
        total_time = time.perf_counter() - start
        summary = {
            "urls": urls,
            "concurrent": concurrent,
            "chunk": chunk,
            "stream": stream,
            "success_urls": agg_success,
            "failed_urls": agg_fail,
            "elapsed_sec": round(total_time, 2),
            "avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
            "max_mem": max(deltas) if deltas else None,
            "avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
            "max_peak": max(peaks) if peaks else None,
        }
        console.print("\n[bold green]Done:[/]" , summary)
        report.mkdir(parents=True, exist_ok=True)
        path = report / f"api_test_{int(time.time())}.json"
        path.write_text(json.dumps(summary, indent=2))
        console.print(f"[green]Summary → {path}")
    finally:
        # BUG FIX: close the connection pool even when a request/health check
        # raises; previously an exception leaked the AsyncClient.
        await client.aclose()
# ───────────────────────── Typer CLI ──────────────────────────────────
@app.command()
def main(
    preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
    api_url: str = typer.Option("http://localhost:8020", show_default=True),
    urls: int = typer.Option(None, help="Total URLs to crawl"),
    concurrent: int = typer.Option(None, help="Concurrent API requests"),
    chunk: int = typer.Option(None, help="URLs per request"),
    stream: bool = typer.Option(None, help="Use /crawl/stream"),
    report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
):
    """Run a stress test against a running Crawl4AI API server."""
    # An unknown preset is only acceptable if every knob was given explicitly.
    overrides = (urls, concurrent, chunk, stream)
    if preset not in PRESETS and any(value is None for value in overrides):
        console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
        raise typer.Exit(1)
    # Fill any option the user left unset from the chosen preset.
    cfg = PRESETS.get(preset, {})
    urls = urls or cfg.get("urls")
    concurrent = concurrent or cfg.get("concurrent")
    chunk = chunk or cfg.get("chunk")
    stream = cfg.get("stream", False) if stream is None else stream
    console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
    asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
if __name__ == "__main__":
    app()  # Typer CLI entry point when executed as a script