Replace crawler_manager.py with a simpler crawler_pool.py implementation:
- Add a global page semaphore for a hard concurrency cap
- Implement a browser pool with idle cleanup
- Add a playground UI for testing and stress testing
- Update API handlers to use pooled crawlers
- Enhance logging levels and symbols

BREAKING CHANGE: Removes the CrawlerManager class in favor of the simpler pool-based approach
35 lines · 1.1 KiB · Python
#!/usr/bin/env python3
"""
Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.

Fires CONCURRENT_CALLS simultaneous POSTs at the /crawl endpoint; each
request crawls a single unique URL, so the server-side global semaphore
is the only thing limiting how many pages render at once.
"""

import argparse  # NOTE(review): imported but unused — no CLI flags are parsed yet
import asyncio
import json      # NOTE(review): imported but unused
import uuid

import httpx

# Endpoint under test (local dev server).
API = "http://localhost:8020/crawl"

URLS_PER_CALL = 1      # keep it minimal so each arun() == 1 page
CONCURRENT_CALLS = 20  # way above your cap

# Base request body; each call copies this and adds its own "urls" key.
# The nested dicts are shared between copies, but no call mutates them.
payload_template = {
    "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {"cache_mode": "BYPASS", "verbose": False},
    },
}
|
|
|
|
async def one_call(client):
    """POST one single-URL crawl request; return the server's reported peak memory (MB).

    Raises httpx.HTTPStatusError on any non-2xx response.
    """
    body = dict(payload_template)
    # A unique path per request ensures every call is a fresh page load.
    body["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
    resp = await client.post(API, json=body)
    resp.raise_for_status()
    return resp.json()["server_peak_memory_mb"]
|
|
|
|
async def main():
    """Fan out CONCURRENT_CALLS requests at once and print the reported memory peaks."""
    async with httpx.AsyncClient(timeout=60) as client:
        # gather() schedules each coroutine as a task itself, so no
        # explicit create_task() is needed here.
        mem_usages = await asyncio.gather(
            *(one_call(client) for _ in range(CONCURRENT_CALLS))
        )
    print("Calls finished OK, server peaks reported:", mem_usages)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the whole stress test on the event loop.
    asyncio.run(main())
|