Adds a new CrawlerManager class to handle browser instance pooling and failover: - Implements auto-scaling based on system resources - Adds primary/backup crawler management - Integrates memory monitoring and throttling - Adds streaming support with memory tracking - Updates API endpoints to use pooled crawlers BREAKING CHANGE: API endpoints now require CrawlerManager initialization
130 lines
4.9 KiB
Python
130 lines
4.9 KiB
Python
"""
|
||
Crawl4AI Docker API stress tester.
|
||
|
||
Examples
|
||
--------
|
||
python test_stress_docker_api.py --urls 1000 --concurrency 32
|
||
python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream
|
||
python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2
|
||
"""
|
||
|
||
import argparse, asyncio, json, secrets, statistics, time
|
||
from typing import List, Tuple
|
||
import httpx
|
||
from rich.console import Console
|
||
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
|
||
from rich.table import Table
|
||
|
||
console = Console()
|
||
|
||
|
||
# ───────────────────────── helpers ─────────────────────────
|
||
def make_fake_urls(n: int) -> List[str]:
|
||
base = "https://httpbin.org/anything/"
|
||
return [f"{base}{secrets.token_hex(8)}" for _ in range(n)]
|
||
|
||
|
||
async def fire(
|
||
client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore
|
||
) -> Tuple[bool, float]:
|
||
async with sem:
|
||
print(f"POST {endpoint} with {len(payload['urls'])} URLs")
|
||
t0 = time.perf_counter()
|
||
try:
|
||
if endpoint.endswith("/stream"):
|
||
async with client.stream("POST", endpoint, json=payload) as r:
|
||
r.raise_for_status()
|
||
async for _ in r.aiter_lines():
|
||
pass
|
||
else:
|
||
r = await client.post(endpoint, json=payload)
|
||
r.raise_for_status()
|
||
return True, time.perf_counter() - t0
|
||
except Exception:
|
||
return False, time.perf_counter() - t0
|
||
|
||
|
||
def pct(lat: List[float], p: float) -> str:
|
||
"""Return percentile string even for tiny samples."""
|
||
if not lat:
|
||
return "-"
|
||
if len(lat) == 1:
|
||
return f"{lat[0]:.2f}s"
|
||
lat_sorted = sorted(lat)
|
||
k = (p / 100) * (len(lat_sorted) - 1)
|
||
lo = int(k)
|
||
hi = min(lo + 1, len(lat_sorted) - 1)
|
||
frac = k - lo
|
||
val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac
|
||
return f"{val:.2f}s"
|
||
|
||
|
||
# ───────────────────────── main ─────────────────────────
|
||
def parse_args() -> argparse.Namespace:
|
||
p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API")
|
||
p.add_argument("--urls", type=int, default=100, help="number of URLs")
|
||
p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight")
|
||
p.add_argument("--chunk-size", type=int, default=50, help="URLs per request")
|
||
p.add_argument("--base-url", default="http://localhost:11235", help="API root")
|
||
# p.add_argument("--base-url", default="http://localhost:8020", help="API root")
|
||
p.add_argument("--stream", action="store_true", help="use /crawl/stream")
|
||
p.add_argument("--http2", action="store_true", help="enable HTTP/2")
|
||
p.add_argument("--headless", action="store_true", default=True)
|
||
return p.parse_args()
|
||
|
||
|
||
async def main() -> None:
|
||
args = parse_args()
|
||
|
||
urls = make_fake_urls(args.urls)
|
||
batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)]
|
||
endpoint = "/crawl/stream" if args.stream else "/crawl"
|
||
sem = asyncio.Semaphore(args.concurrency)
|
||
|
||
async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client:
|
||
with Progress(
|
||
"[progress.description]{task.description}",
|
||
BarColumn(),
|
||
"[progress.percentage]{task.percentage:>3.0f}%",
|
||
TimeElapsedColumn(),
|
||
TimeRemainingColumn(),
|
||
) as progress:
|
||
task_id = progress.add_task("[cyan]bombarding…", total=len(batches))
|
||
tasks = []
|
||
for chunk in batches:
|
||
payload = {
|
||
"urls": chunk,
|
||
"browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}},
|
||
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}},
|
||
}
|
||
tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem)))
|
||
progress.advance(task_id)
|
||
|
||
results = await asyncio.gather(*tasks)
|
||
|
||
ok_latencies = [dt for ok, dt in results if ok]
|
||
err_count = sum(1 for ok, _ in results if not ok)
|
||
|
||
table = Table(title="Docker API Stress‑Test Summary")
|
||
table.add_column("total", justify="right")
|
||
table.add_column("errors", justify="right")
|
||
table.add_column("p50", justify="right")
|
||
table.add_column("p95", justify="right")
|
||
table.add_column("max", justify="right")
|
||
|
||
table.add_row(
|
||
str(len(results)),
|
||
str(err_count),
|
||
pct(ok_latencies, 50),
|
||
pct(ok_latencies, 95),
|
||
f"{max(ok_latencies):.2f}s" if ok_latencies else "-",
|
||
)
|
||
console.print(table)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
try:
|
||
asyncio.run(main())
|
||
except KeyboardInterrupt:
|
||
console.print("\n[yellow]aborted by user[/]")
|