Files
crawl4ai/deploy/docker/crawler_pool.py

105 lines
3.3 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# crawler_pool.py (new file)
import asyncio
import hashlib
import json
import time
from contextlib import suppress
from typing import Dict, Optional
import psutil
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Import browser adapters with fallback
try:
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
except ImportError:
# Fallback for development environment
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
from utils import load_config
CONFIG = load_config()
POOL: Dict[str, AsyncWebCrawler] = {}
LAST_USED: Dict[str, float] = {}
LOCK = asyncio.Lock()
MEM_LIMIT = CONFIG.get("crawler", {}).get(
"memory_threshold_percent", 95.0
) # % RAM refuse new browsers above this
IDLE_TTL = (
CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800)
) # close if unused for 30min
def _sig(cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None) -> str:
config_payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
adapter_name = adapter.__class__.__name__ if adapter else "PlaywrightAdapter"
payload = f"{config_payload}:{adapter_name}"
return hashlib.sha1(payload.encode()).hexdigest()
async def get_crawler(
cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None
) -> AsyncWebCrawler:
try:
sig = _sig(cfg, adapter)
async with LOCK:
if sig in POOL:
LAST_USED[sig] = time.time()
return POOL[sig]
if psutil.virtual_memory().percent >= MEM_LIMIT:
raise MemoryError("RAM pressure new browser denied")
# Create strategy with the specified adapter
strategy = AsyncPlaywrightCrawlerStrategy(
browser_config=cfg, browser_adapter=adapter or PlaywrightAdapter()
)
crawler = AsyncWebCrawler(
config=cfg, crawler_strategy=strategy, thread_safe=False
)
await crawler.start()
POOL[sig] = crawler
LAST_USED[sig] = time.time()
return crawler
except MemoryError as e:
raise MemoryError(f"RAM pressure new browser denied: {e}")
except Exception as e:
raise RuntimeError(f"Failed to start browser: {e}")
finally:
if sig in POOL:
LAST_USED[sig] = time.time()
else:
# If we failed to start the browser, we should remove it from the pool
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
# If we failed to start the browser, we should remove it from the pool
async def close_all():
async with LOCK:
await asyncio.gather(
*(c.close() for c in POOL.values()), return_exceptions=True
)
POOL.clear()
LAST_USED.clear()
async def janitor():
while True:
await asyncio.sleep(60)
now = time.time()
async with LOCK:
for sig, crawler in list(POOL.items()):
if now - LAST_USED[sig] > IDLE_TTL:
with suppress(Exception):
await crawler.close()
POOL.pop(sig, None)
LAST_USED.pop(sig, None)