Files
crawl4ai/deploy/docker/crawler_pool.py

120 lines
4.0 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# crawler_pool.py (new file)
import asyncio
import hashlib
import json
import time
from contextlib import suppress
from typing import Dict, Optional
import psutil
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
# Import browser adapters with fallback
try:
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
except ImportError:
# Fallback for development environment
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
from utils import load_config
# Service configuration, loaded once when this module is imported.
CONFIG = load_config()

# Started crawlers keyed by configuration signature, plus the time.time()
# stamp of the last moment each one was handed out.
POOL: Dict[str, AsyncWebCrawler] = {}
LAST_USED: Dict[str, float] = {}

# Lock the pool functions in this file acquire around their POOL / LAST_USED work.
LOCK = asyncio.Lock()

# The "crawler" section of the configuration (empty when absent).
_CRAWLER_SETTINGS = CONFIG.get("crawler", {})

# Percentage of RAM in use at which new browsers are refused (default 95.0).
MEM_LIMIT = _CRAWLER_SETTINGS.get("memory_threshold_percent", 95.0)

# Seconds a crawler may stay unused before it is closed (default 1800 = 30 min).
IDLE_TTL = _CRAWLER_SETTINGS.get("pool", {}).get("idle_ttl_sec", 1800)
def _sig(cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None) -> str:
    """Return the pool key for a browser configuration / adapter pair.

    The key is the SHA-1 hex digest of ``"<config>:<adapter class name>"``,
    where ``<config>`` is ``cfg.to_dict()`` rendered as compact JSON with
    sorted keys, so the key does not depend on dict insertion order.

    Args:
        cfg: Configuration object exposing ``to_dict()``.
        adapter: Browser adapter instance. When it is falsy (e.g. ``None``)
            the name ``"PlaywrightAdapter"`` is used.

    Returns:
        The 40-character hexadecimal SHA-1 digest.
    """
    cfg_dict = cfg.to_dict()
    try:
        # default=repr lets values JSON cannot encode natively still go through
        # the sorted-key encoder; a plain str(dict) would make the key depend
        # on insertion order and split equal configs across several browsers.
        config_payload = json.dumps(
            cfg_dict, sort_keys=True, separators=(",", ":"), default=repr
        )
    except (TypeError, ValueError):
        # Last resort for dicts json cannot walk at all (unsupported or
        # unsortable keys, circular references).
        config_payload = str(cfg_dict)
    adapter_name = adapter.__class__.__name__ if adapter else "PlaywrightAdapter"
    payload = f"{config_payload}:{adapter_name}"
    return hashlib.sha1(payload.encode()).hexdigest()
async def get_crawler(
    cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None
) -> AsyncWebCrawler:
    """Return a started crawler for ``cfg`` / ``adapter``, reusing a pooled one.

    The pool is keyed by ``_sig(cfg, adapter)``. On a hit the entry's
    ``LAST_USED`` stamp is refreshed and the pooled crawler is returned.
    On a miss a new ``AsyncWebCrawler`` is created, started, stored in
    ``POOL`` and returned.

    Args:
        cfg: Browser configuration used both for the pool key and to build
            the crawler.
        adapter: Optional browser adapter. When truthy, the crawler's
            strategy is replaced by an ``AsyncPlaywrightCrawlerStrategy``
            built with this adapter and the crawler's own logger.

    Returns:
        The pooled or newly started crawler.

    Raises:
        MemoryError: RAM usage is at or above ``MEM_LIMIT`` (or a
            ``MemoryError`` surfaced while creating the browser).
        RuntimeError: Any other failure while computing the signature or
            creating/starting the browser; the original error is chained
            as ``__cause__``.
    """
    try:
        sig = _sig(cfg, adapter)
        # The lock is held across start(), so concurrent requests for the
        # same signature wait here and then hit the pool instead of
        # launching a second browser.
        async with LOCK:
            if sig in POOL:
                LAST_USED[sig] = time.time()
                return POOL[sig]
            if psutil.virtual_memory().percent >= MEM_LIMIT:
                raise MemoryError("RAM pressure new browser denied")
            # Create crawler - let it initialize the strategy with proper logger
            crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
            if adapter:
                # Swap in a strategy that carries the adapter while keeping
                # the logger the crawler created for itself.
                crawler.crawler_strategy = AsyncPlaywrightCrawlerStrategy(
                    browser_config=cfg,
                    logger=crawler.logger,
                    browser_adapter=adapter,
                )
            try:
                await crawler.start()
            except BaseException:
                # A half-started crawler is never pooled, so nobody else
                # would ever close it; release it best-effort before the
                # error propagates.
                with suppress(Exception):
                    await crawler.close()
                raise
            # POOL / LAST_USED are only written once start() has succeeded,
            # so a failure leaves nothing behind to clean up.
            POOL[sig] = crawler
            LAST_USED[sig] = time.time()
            return crawler
    except MemoryError as e:
        raise MemoryError(f"RAM pressure new browser denied: {e}") from e
    except Exception as e:
        raise RuntimeError(f"Failed to start browser: {e}") from e
async def close_all():
    """Close every pooled crawler and empty the pool.

    Runs entirely under ``LOCK``. All ``close()`` calls are awaited
    together; exceptions raised by individual crawlers are collected by
    ``gather`` rather than propagated, and both ``POOL`` and ``LAST_USED``
    are cleared afterwards.
    """
    async with LOCK:
        shutdowns = [pooled.close() for pooled in POOL.values()]
        # return_exceptions=True: one failing close() must not stop the rest.
        await asyncio.gather(*shutdowns, return_exceptions=True)
        POOL.clear()
        LAST_USED.clear()
async def janitor():
    """Periodically close and evict crawlers that have been idle too long.

    Loops forever: sleeps 60 seconds, then, under ``LOCK``, walks a snapshot
    of ``POOL`` and closes every crawler whose ``LAST_USED`` stamp is more
    than ``IDLE_TTL`` seconds older than the time sampled after the sleep.
    Errors raised by ``close()`` are suppressed and the entry is removed
    from ``POOL`` and ``LAST_USED`` either way.
    """
    while True:
        await asyncio.sleep(60)
        now = time.time()
        async with LOCK:
            # Iterate over a copy because entries are removed during the sweep.
            snapshot = list(POOL.items())
            for key, pooled in snapshot:
                idle_for = now - LAST_USED[key]
                if idle_for <= IDLE_TTL:
                    continue
                with suppress(Exception):
                    await pooled.close()
                POOL.pop(key, None)
                LAST_USED.pop(key, None)