refactor(server): migrate to pool-based crawler management

Replace crawler_manager.py with simpler crawler_pool.py implementation:
- Add global page semaphore for hard concurrency cap
- Implement browser pool with idle cleanup
- Add playground UI for testing and stress testing
- Update API handlers to use pooled crawlers
- Enhance logging levels and symbols

BREAKING CHANGE: Removes CrawlerManager class in favor of simpler pool-based approach
This commit is contained in:
UncleCode
2025-04-20 20:14:26 +08:00
parent 16b2318242
commit a58c8000aa
14 changed files with 1447 additions and 1435 deletions

34
tests/memory/cap_test.py Normal file
View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
"""
import asyncio, httpx, json, uuid, argparse
API = "http://localhost:8020/crawl"
URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
CONCURRENT_CALLS = 20 # way above your cap
payload_template = {
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "verbose": False},
}
}
async def one_call(client):
payload = payload_template.copy()
payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
r = await client.post(API, json=payload)
r.raise_for_status()
return r.json()["server_peak_memory_mb"]
async def main():
async with httpx.AsyncClient(timeout=60) as client:
tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
mem_usages = await asyncio.gather(*tasks)
print("Calls finished OK, server peaks reported:", mem_usages)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""
Quick sanitycheck for /config/dump endpoint.
Usage:
python test_config_dump.py [http://localhost:8020]
If the server isnt running, start it first:
uvicorn deploy.docker.server:app --port 8020
"""
import sys, json, textwrap, requests
BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
URL = f"{BASE.rstrip('/')}/config/dump"
CASES = [
# --- CrawlRunConfig variants ---
"CrawlerRunConfig()",
"CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
"CrawlerRunConfig(js_only=True, wait_until='networkidle')",
# --- BrowserConfig variants ---
"BrowserConfig()",
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
]
for code in CASES:
print("\n=== POST:", code)
resp = requests.post(URL, json={"code": code}, timeout=15)
if resp.ok:
print(json.dumps(resp.json(), indent=2)[:400] + "...")
else:
print("ERROR", resp.status_code, resp.text[:200])

View File

@@ -24,13 +24,13 @@ from rich.panel import Panel
from rich.syntax import Syntax
# --- Constants ---
# DEFAULT_API_URL = "http://localhost:11235" # Default port
DEFAULT_API_URL = "http://localhost:11235" # Default port
DEFAULT_API_URL = "http://localhost:8020" # Default port
DEFAULT_URL_COUNT = 1000
DEFAULT_MAX_CONCURRENT_REQUESTS = 5
DEFAULT_URL_COUNT = 100
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
DEFAULT_CHUNK_SIZE = 10
DEFAULT_REPORT_PATH = "reports_api"
DEFAULT_STREAM_MODE = False
DEFAULT_STREAM_MODE = True
REQUEST_TIMEOUT = 180.0
# Initialize Rich console
@@ -77,6 +77,10 @@ class ApiStressTest:
self.report_path = pathlib.Path(report_path)
self.report_path.mkdir(parents=True, exist_ok=True)
self.stream_mode = stream_mode
# Ignore repo path and set it to current file path
self.repo_path = pathlib.Path(__file__).parent.resolve()
self.test_id = time.strftime("%Y%m%d_%H%M%S")
self.results_summary = {

View File

@@ -0,0 +1,203 @@
"""Lite Crawl4AI API stresstester.
✔ batch or stream mode (single unified path)
✔ global stats + JSON summary
✔ rich table progress
✔ Typer CLI with presets (quick / soak)
Usage examples:
python api_stress_test.py # uses quick preset
python api_stress_test.py soak # 5K URLs stress run
python api_stress_test.py --urls 200 --concurrent 10 --chunk 20
"""
from __future__ import annotations
import asyncio, json, time, uuid, pathlib, statistics
from typing import List, Dict, Optional
import httpx, typer
from rich.console import Console
from rich.table import Table
# ───────────────────────── defaults / presets ──────────────────────────
PRESETS = {
"quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
"debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
"soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
}
API_HEALTH_ENDPOINT = "/health"
REQUEST_TIMEOUT = 180.0
console = Console()
app = typer.Typer(add_completion=False, rich_markup_mode="rich")
# ───────────────────────── helpers ─────────────────────────────────────
async def _check_health(client: httpx.AsyncClient) -> None:
resp = await client.get(API_HEALTH_ENDPOINT, timeout=10)
resp.raise_for_status()
console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]")
async def _iter_results(resp: httpx.Response, stream: bool):
"""Yield result dicts from batch JSON or NDJSON stream."""
if stream:
async for line in resp.aiter_lines():
if not line:
continue
rec = json.loads(line)
if rec.get("status") == "completed":
break
yield rec
else:
data = resp.json()
for rec in data.get("results", []):
yield rec, data # rec + whole payload for memory delta/peak
async def _consume_stream(resp: httpx.Response) -> Dict:
stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0}
async for line in resp.aiter_lines():
if not line:
continue
rec = json.loads(line)
if rec.get("status") == "completed":
break
if rec.get("success"):
stats["success_urls"] += 1
else:
stats["failed_urls"] += 1
mem = rec.get("server_memory_mb")
if mem is not None:
stats["mem_metric"] = max(stats["mem_metric"], float(mem))
return stats
def _consume_batch(body: Dict) -> Dict:
stats = {"success_urls": 0, "failed_urls": 0}
for rec in body.get("results", []):
if rec.get("success"):
stats["success_urls"] += 1
else:
stats["failed_urls"] += 1
stats["mem_metric"] = body.get("server_memory_delta_mb")
stats["peak"] = body.get("server_peak_memory_mb")
return stats
async def _fetch_chunk(
client: httpx.AsyncClient,
urls: List[str],
stream: bool,
semaphore: asyncio.Semaphore,
) -> Dict:
endpoint = "/crawl/stream" if stream else "/crawl"
payload = {
"urls": urls,
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig",
"params": {"cache_mode": "BYPASS", "stream": stream}},
}
async with semaphore:
start = time.perf_counter()
if stream:
# ---- streaming request ----
async with client.stream("POST", endpoint, json=payload) as resp:
resp.raise_for_status()
stats = await _consume_stream(resp)
else:
# ---- batch request ----
resp = await client.post(endpoint, json=payload)
resp.raise_for_status()
stats = _consume_batch(resp.json())
stats["elapsed"] = time.perf_counter() - start
return stats
# ───────────────────────── core runner ─────────────────────────────────
async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
await _check_health(client)
url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
sem = asyncio.Semaphore(concurrent)
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Batch", style="dim", width=6)
table.add_column("Success/Fail", width=12)
table.add_column("Mem", width=14)
table.add_column("Time (s)")
agg_success = agg_fail = 0
deltas, peaks = [], []
start = time.perf_counter()
tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
res = await coro
agg_success += res["success_urls"]
agg_fail += res["failed_urls"]
if res["mem_metric"] is not None:
deltas.append(res["mem_metric"])
if res["peak"] is not None:
peaks.append(res["peak"])
mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else ""
if res["peak"] is not None:
mem_txt = f"{res['peak']:.1f}/{mem_txt}"
table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
console.print(table)
total_time = time.perf_counter() - start
summary = {
"urls": urls,
"concurrent": concurrent,
"chunk": chunk,
"stream": stream,
"success_urls": agg_success,
"failed_urls": agg_fail,
"elapsed_sec": round(total_time, 2),
"avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
"max_mem": max(deltas) if deltas else None,
"avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
"max_peak": max(peaks) if peaks else None,
}
console.print("\n[bold green]Done:[/]" , summary)
report.mkdir(parents=True, exist_ok=True)
path = report / f"api_test_{int(time.time())}.json"
path.write_text(json.dumps(summary, indent=2))
console.print(f"[green]Summary → {path}")
await client.aclose()
# ───────────────────────── Typer CLI ──────────────────────────────────
@app.command()
def main(
preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
api_url: str = typer.Option("http://localhost:8020", show_default=True),
urls: int = typer.Option(None, help="Total URLs to crawl"),
concurrent: int = typer.Option(None, help="Concurrent API requests"),
chunk: int = typer.Option(None, help="URLs per request"),
stream: bool = typer.Option(None, help="Use /crawl/stream"),
report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
):
"""Run a stress test against a running Crawl4AI API server."""
if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)):
console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
raise typer.Exit(1)
cfg = PRESETS.get(preset, {})
urls = urls or cfg.get("urls")
concurrent = concurrent or cfg.get("concurrent")
chunk = chunk or cfg.get("chunk")
stream = stream if stream is not None else cfg.get("stream", False)
console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
if __name__ == "__main__":
app()