feat: enhance crawling functionality with anti-bot strategies and headless mode options (Browser adapters , 12.Undetected/stealth browser)
This commit is contained in:
@@ -1,10 +1,27 @@
|
||||
# crawler_pool.py (new file)
|
||||
import asyncio, json, hashlib, time, psutil
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
from contextlib import suppress
|
||||
from typing import Dict
|
||||
from typing import Dict, Optional
|
||||
|
||||
import psutil
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
from typing import Dict
|
||||
from utils import load_config
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Import browser adapters with fallback
|
||||
try:
|
||||
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
|
||||
except ImportError:
|
||||
# Fallback for development environment
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
|
||||
from utils import load_config
|
||||
|
||||
CONFIG = load_config()
|
||||
|
||||
@@ -12,25 +29,44 @@ POOL: Dict[str, AsyncWebCrawler] = {}
|
||||
LAST_USED: Dict[str, float] = {}
|
||||
LOCK = asyncio.Lock()
|
||||
|
||||
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
|
||||
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min
|
||||
MEM_LIMIT = CONFIG.get("crawler", {}).get(
|
||||
"memory_threshold_percent", 95.0
|
||||
) # % RAM – refuse new browsers above this
|
||||
IDLE_TTL = (
|
||||
CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800)
|
||||
) # close if unused for 30 min
|
||||
|
||||
def _sig(cfg: BrowserConfig) -> str:
|
||||
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
|
||||
|
||||
def _sig(cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None) -> str:
|
||||
config_payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
|
||||
adapter_name = adapter.__class__.__name__ if adapter else "PlaywrightAdapter"
|
||||
payload = f"{config_payload}:{adapter_name}"
|
||||
return hashlib.sha1(payload.encode()).hexdigest()
|
||||
|
||||
async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
|
||||
async def get_crawler(
|
||||
cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None
|
||||
) -> AsyncWebCrawler:
|
||||
try:
|
||||
sig = _sig(cfg)
|
||||
sig = _sig(cfg, adapter)
|
||||
async with LOCK:
|
||||
if sig in POOL:
|
||||
LAST_USED[sig] = time.time();
|
||||
LAST_USED[sig] = time.time()
|
||||
return POOL[sig]
|
||||
if psutil.virtual_memory().percent >= MEM_LIMIT:
|
||||
raise MemoryError("RAM pressure – new browser denied")
|
||||
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
|
||||
|
||||
# Create strategy with the specified adapter
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=cfg, browser_adapter=adapter or PlaywrightAdapter()
|
||||
)
|
||||
|
||||
crawler = AsyncWebCrawler(
|
||||
config=cfg, crawler_strategy=strategy, thread_safe=False
|
||||
)
|
||||
await crawler.start()
|
||||
POOL[sig] = crawler; LAST_USED[sig] = time.time()
|
||||
POOL[sig] = crawler
|
||||
LAST_USED[sig] = time.time()
|
||||
return crawler
|
||||
except MemoryError as e:
|
||||
raise MemoryError(f"RAM pressure – new browser denied: {e}")
|
||||
@@ -44,10 +80,16 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
POOL.pop(sig, None)
|
||||
LAST_USED.pop(sig, None)
|
||||
# If we failed to start the browser, we should remove it from the pool
|
||||
|
||||
|
||||
async def close_all():
|
||||
async with LOCK:
|
||||
await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
|
||||
POOL.clear(); LAST_USED.clear()
|
||||
await asyncio.gather(
|
||||
*(c.close() for c in POOL.values()), return_exceptions=True
|
||||
)
|
||||
POOL.clear()
|
||||
LAST_USED.clear()
|
||||
|
||||
|
||||
async def janitor():
|
||||
while True:
|
||||
@@ -56,5 +98,7 @@ async def janitor():
|
||||
async with LOCK:
|
||||
for sig, crawler in list(POOL.items()):
|
||||
if now - LAST_USED[sig] > IDLE_TTL:
|
||||
with suppress(Exception): await crawler.close()
|
||||
POOL.pop(sig, None); LAST_USED.pop(sig, None)
|
||||
with suppress(Exception):
|
||||
await crawler.close()
|
||||
POOL.pop(sig, None)
|
||||
LAST_USED.pop(sig, None)
|
||||
|
||||
Reference in New Issue
Block a user