feat: enhance crawling functionality with anti-bot strategies and headless mode options (Browser adapters, 12. Undetected/stealth browser)

AHMET YILMAZ
2025-10-03 18:02:10 +08:00
parent a599db8f7b
commit 5dc34dd210
4 changed files with 421 additions and 257 deletions


@@ -1,10 +1,27 @@
 # crawler_pool.py (new file)
-import asyncio, json, hashlib, time, psutil
+import asyncio
+import hashlib
+import json
+import time
 from contextlib import suppress
-from typing import Dict
+from typing import Dict, Optional
+import psutil
 from crawl4ai import AsyncWebCrawler, BrowserConfig
-from typing import Dict
-from utils import load_config
+from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+# Import browser adapters with fallback
+try:
+    from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
+except ImportError:
+    # Fallback for development environment
+    import os
+    import sys
+    sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+    from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter
+from utils import load_config
 CONFIG = load_config()
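The import fallback above keeps the module importable both when crawl4ai is installed as a package and when running from a source checkout. Below is a minimal sketch of how a server might map its config onto one of these adapters; the "browser_adapter" config key and the adapter_from_config helper are hypothetical, not part of this commit, and the stealth adapter class name depends on the installed crawl4ai version.

# Hypothetical helper: translate a config flag into a BrowserAdapter instance.
# "browser_adapter" is an assumed config key; only PlaywrightAdapter is known
# to exist here, so the stealth/undetected adapter is hedged behind try/except.
from typing import Optional

from crawl4ai.browser_adapter import BrowserAdapter, PlaywrightAdapter


def adapter_from_config(config: dict) -> Optional[BrowserAdapter]:
    name = config.get("crawler", {}).get("browser_adapter", "playwright").lower()
    if name in ("undetected", "stealth"):
        try:
            # Newer crawl4ai releases ship an undetected/stealth adapter; the exact name may differ.
            from crawl4ai.browser_adapter import UndetectedAdapter
            return UndetectedAdapter()
        except ImportError:
            pass  # fall back to the default adapter below
    return PlaywrightAdapter()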
@@ -12,25 +29,44 @@ POOL: Dict[str, AsyncWebCrawler] = {}
 LAST_USED: Dict[str, float] = {}
 LOCK = asyncio.Lock()
-MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM refuse new browsers above this
-IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30min
+MEM_LIMIT = CONFIG.get("crawler", {}).get(
+    "memory_threshold_percent", 95.0
+)  # % RAM refuse new browsers above this
+IDLE_TTL = (
+    CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800)
+)  # close if unused for 30min
-def _sig(cfg: BrowserConfig) -> str:
-    payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
+def _sig(cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None) -> str:
+    config_payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
+    adapter_name = adapter.__class__.__name__ if adapter else "PlaywrightAdapter"
+    payload = f"{config_payload}:{adapter_name}"
     return hashlib.sha1(payload.encode()).hexdigest()
-async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
+async def get_crawler(
+    cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None
+) -> AsyncWebCrawler:
     try:
-        sig = _sig(cfg)
+        sig = _sig(cfg, adapter)
         async with LOCK:
             if sig in POOL:
-                LAST_USED[sig] = time.time();
+                LAST_USED[sig] = time.time()
                 return POOL[sig]
             if psutil.virtual_memory().percent >= MEM_LIMIT:
                 raise MemoryError("RAM pressure new browser denied")
-            crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
+            # Create strategy with the specified adapter
+            strategy = AsyncPlaywrightCrawlerStrategy(
+                browser_config=cfg, browser_adapter=adapter or PlaywrightAdapter()
+            )
+            crawler = AsyncWebCrawler(
+                config=cfg, crawler_strategy=strategy, thread_safe=False
+            )
             await crawler.start()
-            POOL[sig] = crawler; LAST_USED[sig] = time.time()
+            POOL[sig] = crawler
+            LAST_USED[sig] = time.time()
             return crawler
     except MemoryError as e:
         raise MemoryError(f"RAM pressure new browser denied: {e}")
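With _sig() now hashing the adapter class name alongside the serialized BrowserConfig, a crawler started with a stealth adapter is never handed out to a request that asked for the default Playwright adapter, and vice versa. A usage sketch follows, under the assumption that this module is importable as crawler_pool; the stealth adapter mentioned in the comments is illustrative.

import asyncio

from crawl4ai import BrowserConfig
from crawl4ai.browser_adapter import PlaywrightAdapter

from crawler_pool import get_crawler, close_all


async def main():
    cfg = BrowserConfig(headless=True)

    # Same config, default adapter: both calls hash to one signature and share a browser.
    a = await get_crawler(cfg)
    b = await get_crawler(cfg, PlaywrightAdapter())
    assert a is b

    # A different adapter class (e.g. an undetected/stealth one, if installed)
    # changes the signature and gets its own pooled browser instance.
    await close_all()


asyncio.run(main())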
@@ -44,10 +80,16 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
         POOL.pop(sig, None)
         LAST_USED.pop(sig, None)
         # If we failed to start the browser, we should remove it from the pool
 async def close_all():
     async with LOCK:
-        await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
-        POOL.clear(); LAST_USED.clear()
+        await asyncio.gather(
+            *(c.close() for c in POOL.values()), return_exceptions=True
+        )
+        POOL.clear()
+        LAST_USED.clear()
 async def janitor():
     while True:
@@ -56,5 +98,7 @@ async def janitor():
         async with LOCK:
             for sig, crawler in list(POOL.items()):
                 if now - LAST_USED[sig] > IDLE_TTL:
-                    with suppress(Exception): await crawler.close()
-                    POOL.pop(sig, None); LAST_USED.pop(sig, None)
+                    with suppress(Exception):
+                        await crawler.close()
+                    POOL.pop(sig, None)
+                    LAST_USED.pop(sig, None)
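janitor() runs forever and close_all() tears the whole pool down, so both are meant to be tied to the service lifecycle rather than called ad hoc. A hypothetical wiring sketch, assuming the surrounding server is a FastAPI app (the actual server integration is outside this diff):

import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI

import crawler_pool


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Evict browsers idle longer than IDLE_TTL while the service is up.
    janitor_task = asyncio.create_task(crawler_pool.janitor())
    try:
        yield
    finally:
        janitor_task.cancel()
        # Close every pooled browser on shutdown, ignoring individual failures.
        await crawler_pool.close_all()


app = FastAPI(lifespan=lifespan)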