| Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
+| | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education, empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
+
+
+
+### 🧑‍🤝‍🧑 Individual Sponsors
+
+A heartfelt thanks to our individual supporters! Every contribution helps us keep our open-source mission alive and thriving!
+
+> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode)
+
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index 6917f27e..8f1fdef4 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -103,7 +103,8 @@ from .browser_adapter import (
from .utils import (
start_colab_display_server,
- setup_colab_environment
+ setup_colab_environment,
+ hooks_to_string
)
__all__ = [
@@ -183,6 +184,7 @@ __all__ = [
"ProxyConfig",
"start_colab_display_server",
"setup_colab_environment",
+ "hooks_to_string",
# C4A Script additions
"c4a_compile",
"c4a_validate",
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index b73a591d..e70e91c0 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,7 +1,7 @@
# crawl4ai/__version__.py
# This is the version that will be used for stable releases
-__version__ = "0.7.4"
+__version__ = "0.7.7"
# For nightly builds, this gets set during build process
__nightly_version__ = None
diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py
index a0b8fa9c..bce1da23 100644
--- a/crawl4ai/adaptive_crawler.py
+++ b/crawl4ai/adaptive_crawler.py
@@ -19,7 +19,7 @@ import re
from pathlib import Path
from crawl4ai.async_webcrawler import AsyncWebCrawler
-from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
+from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
from crawl4ai.models import Link, CrawlResult
import numpy as np
@@ -178,7 +178,7 @@ class AdaptiveConfig:
# Embedding strategy parameters
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
- embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
+ embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
n_query_variations: int = 10
coverage_threshold: float = 0.85
alpha_shape_alpha: float = 0.5
@@ -250,6 +250,30 @@ class AdaptiveConfig:
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
+
+ @property
+ def _embedding_llm_config_dict(self) -> Optional[Dict]:
+ """Convert LLMConfig to dict format for backward compatibility."""
+ if self.embedding_llm_config is None:
+ return None
+
+ if isinstance(self.embedding_llm_config, dict):
+ # Already a dict - return as-is for backward compatibility
+ return self.embedding_llm_config
+
+ # Convert LLMConfig object to dict format
+ return {
+ 'provider': self.embedding_llm_config.provider,
+ 'api_token': self.embedding_llm_config.api_token,
+ 'base_url': getattr(self.embedding_llm_config, 'base_url', None),
+ 'temperature': getattr(self.embedding_llm_config, 'temperature', None),
+ 'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
+ 'top_p': getattr(self.embedding_llm_config, 'top_p', None),
+ 'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
+ 'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
+ 'stop': getattr(self.embedding_llm_config, 'stop', None),
+ 'n': getattr(self.embedding_llm_config, 'n', None),
+ }
class CrawlStrategy(ABC):
@@ -593,7 +617,7 @@ class StatisticalStrategy(CrawlStrategy):
class EmbeddingStrategy(CrawlStrategy):
"""Embedding-based adaptive crawling using semantic space coverage"""
- def __init__(self, embedding_model: str = None, llm_config: Dict = None):
+ def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
self.llm_config = llm_config
self._embedding_cache = {}
@@ -605,14 +629,24 @@ class EmbeddingStrategy(CrawlStrategy):
self._kb_embeddings_hash = None # Track KB changes
self._validation_embeddings_cache = None # Cache validation query embeddings
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
+
+ def _get_embedding_llm_config_dict(self) -> Dict:
+ """Get embedding LLM config as dict with fallback to default."""
+ if hasattr(self, 'config') and self.config:
+ config_dict = self.config._embedding_llm_config_dict
+ if config_dict:
+ return config_dict
+
+ # Fallback to default if no config provided
+ return {
+ 'provider': 'openai/text-embedding-3-small',
+ 'api_token': os.getenv('OPENAI_API_KEY')
+ }
async def _get_embeddings(self, texts: List[str]) -> Any:
"""Get embeddings using configured method"""
from .utils import get_text_embeddings
- embedding_llm_config = {
- 'provider': 'openai/text-embedding-3-small',
- 'api_token': os.getenv('OPENAI_API_KEY')
- }
+ embedding_llm_config = self._get_embedding_llm_config_dict()
return await get_text_embeddings(
texts,
embedding_llm_config,
@@ -679,8 +713,20 @@ class EmbeddingStrategy(CrawlStrategy):
Return as a JSON array of strings."""
# Use the LLM for query generation
- provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
- api_token = self.llm_config.get('api_token') if self.llm_config else None
+ # Convert LLMConfig to dict if needed
+ llm_config_dict = None
+ if self.llm_config:
+ if isinstance(self.llm_config, dict):
+ llm_config_dict = self.llm_config
+ else:
+ # Convert LLMConfig object to dict
+ llm_config_dict = {
+ 'provider': self.llm_config.provider,
+ 'api_token': self.llm_config.api_token
+ }
+
+ provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
+ api_token = llm_config_dict.get('api_token') if llm_config_dict else None
# response = perform_completion_with_backoff(
# provider=provider,
@@ -843,10 +889,7 @@ class EmbeddingStrategy(CrawlStrategy):
# Batch embed only uncached links
if texts_to_embed:
- embedding_llm_config = {
- 'provider': 'openai/text-embedding-3-small',
- 'api_token': os.getenv('OPENAI_API_KEY')
- }
+ embedding_llm_config = self._get_embedding_llm_config_dict()
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
# Cache the new embeddings
@@ -1184,10 +1227,7 @@ class EmbeddingStrategy(CrawlStrategy):
return
# Get embeddings for new texts
- embedding_llm_config = {
- 'provider': 'openai/text-embedding-3-small',
- 'api_token': os.getenv('OPENAI_API_KEY')
- }
+ embedding_llm_config = self._get_embedding_llm_config_dict()
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
# Deduplicate embeddings before adding to KB
@@ -1256,10 +1296,12 @@ class AdaptiveCrawler:
if strategy_name == "statistical":
return StatisticalStrategy()
elif strategy_name == "embedding":
- return EmbeddingStrategy(
+ strategy = EmbeddingStrategy(
embedding_model=self.config.embedding_model,
llm_config=self.config.embedding_llm_config
)
+ strategy.config = self.config # Pass config to strategy
+ return strategy
else:
raise ValueError(f"Unknown strategy: {strategy_name}")
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index a43b50a4..bfa0d398 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,5 +1,7 @@
import os
from typing import Union
+import warnings
+import requests
from .config import (
DEFAULT_PROVIDER,
DEFAULT_PROVIDER_API_KEY,
@@ -97,13 +99,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
if value != param.default and not ignore_default_value:
current_values[name] = to_serializable_dict(value)
- if hasattr(obj, '__slots__'):
- for slot in obj.__slots__:
- if slot.startswith('_'): # Handle private slots
- attr_name = slot[1:] # Remove leading '_'
- value = getattr(obj, slot, None)
- if value is not None:
- current_values[attr_name] = to_serializable_dict(value)
+ # Don't serialize private __slots__ - they're internal implementation details
+ # not constructor parameters. This was causing URLPatternFilter to fail
+ # because _simple_suffixes was being serialized as 'simple_suffixes'
+ # if hasattr(obj, '__slots__'):
+ # for slot in obj.__slots__:
+ # if slot.startswith('_'): # Handle private slots
+ # attr_name = slot[1:] # Remove leading '_'
+ # value = getattr(obj, slot, None)
+ # if value is not None:
+ # current_values[attr_name] = to_serializable_dict(value)
@@ -254,24 +259,39 @@ class ProxyConfig:
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
- """Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
- parts = proxy_str.split(":")
- if len(parts) == 4: # ip:port:username:password
+ """Create a ProxyConfig from a string.
+
+ Supported formats:
+ - 'http://username:password@ip:port'
+ - 'http://ip:port'
+ - 'socks5://ip:port'
+ - 'ip:port:username:password'
+ - 'ip:port'
+ """
+ s = (proxy_str or "").strip()
+ # URL with credentials
+ if "@" in s and "://" in s:
+ auth_part, server_part = s.split("@", 1)
+ protocol, credentials = auth_part.split("://", 1)
+ if ":" in credentials:
+ username, password = credentials.split(":", 1)
+ return ProxyConfig(
+ server=f"{protocol}://{server_part}",
+ username=username,
+ password=password,
+ )
+ # URL without credentials (keep scheme)
+ if "://" in s and "@" not in s:
+ return ProxyConfig(server=s)
+ # Colon separated forms
+ parts = s.split(":")
+ if len(parts) == 4:
ip, port, username, password = parts
- return ProxyConfig(
- server=f"http://{ip}:{port}",
- username=username,
- password=password,
- ip=ip
- )
- elif len(parts) == 2: # ip:port only
+ return ProxyConfig(server=f"http://{ip}:{port}", username=username, password=password)
+ if len(parts) == 2:
ip, port = parts
- return ProxyConfig(
- server=f"http://{ip}:{port}",
- ip=ip
- )
- else:
- raise ValueError(f"Invalid proxy string format: {proxy_str}")
+ return ProxyConfig(server=f"http://{ip}:{port}")
+ raise ValueError(f"Invalid proxy string format: {proxy_str}")
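
A quick sketch of the formats the rewritten `from_string` now accepts (addresses and credentials are placeholders):

```python
from crawl4ai import ProxyConfig

# URL with credentials: scheme is preserved, user/pass are split out.
p = ProxyConfig.from_string("http://user:pass@1.2.3.4:8080")

# URL without credentials: kept as-is, so socks5:// now round-trips.
p = ProxyConfig.from_string("socks5://1.2.3.4:1080")

# Legacy colon-separated forms still work and default to http://.
p = ProxyConfig.from_string("1.2.3.4:8080:user:pass")
p = ProxyConfig.from_string("1.2.3.4:8080")
```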
@staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
@@ -435,6 +455,7 @@ class BrowserConfig:
host: str = "localhost",
enable_stealth: bool = False,
):
+
self.browser_type = browser_type
self.headless = headless
self.browser_mode = browser_mode
@@ -447,13 +468,22 @@ class BrowserConfig:
if self.browser_type in ["firefox", "webkit"]:
self.channel = ""
self.chrome_channel = ""
+ if proxy:
+ warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", UserWarning)
self.proxy = proxy
self.proxy_config = proxy_config
if isinstance(self.proxy_config, dict):
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
if isinstance(self.proxy_config, str):
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
-
+
+ if self.proxy and self.proxy_config:
+ warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
+ self.proxy = None
+ elif self.proxy:
+ # Convert proxy string to ProxyConfig if proxy_config is not provided
+ self.proxy_config = ProxyConfig.from_string(self.proxy)
+ self.proxy = None
self.viewport_width = viewport_width
self.viewport_height = viewport_height
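
Given the deprecation above, migration is a one-line change; a sketch with placeholder addresses:

```python
import warnings
from crawl4ai import BrowserConfig

# Deprecated path: emits a UserWarning, then the string is converted via
# ProxyConfig.from_string and moved into proxy_config.
with warnings.catch_warnings(record=True):
    cfg = BrowserConfig(proxy="http://1.2.3.4:8080")
assert cfg.proxy is None
assert cfg.proxy_config.server == "http://1.2.3.4:8080"

# Preferred path: pass proxy_config directly (string, dict, or ProxyConfig).
cfg = BrowserConfig(proxy_config="http://user:pass@1.2.3.4:8080")
```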
@@ -620,6 +650,85 @@ class BrowserConfig:
return config
return BrowserConfig.from_kwargs(config)
+ def set_nstproxy(
+ self,
+ token: str,
+ channel_id: str,
+ country: str = "ANY",
+ state: str = "",
+ city: str = "",
+ protocol: str = "http",
+ session_duration: int = 10,
+ ):
+ """
+ Fetch a proxy from NSTProxy API and automatically assign it to proxy_config.
+
+ Get your NSTProxy token from: https://app.nstproxy.com/profile
+
+ Args:
+ token (str): NSTProxy API token.
+ channel_id (str): NSTProxy channel ID.
+ country (str, optional): Country code (default: "ANY").
+ state (str, optional): State code (default: "").
+ city (str, optional): City name (default: "").
+ protocol (str, optional): Proxy protocol ("http" or "socks5"). Defaults to "http".
+ session_duration (int, optional): Session duration in minutes (0 = rotate each request). Defaults to 10.
+
+ Raises:
+ ValueError: If the API response format is invalid.
+ PermissionError: If the API returns an error message.
+ """
+
+ # --- Validate input early ---
+ if not token or not channel_id:
+ raise ValueError("[NSTProxy] token and channel_id are required")
+
+ if protocol not in ("http", "socks5"):
+ raise ValueError(f"[NSTProxy] Invalid protocol: {protocol}")
+
+ # --- Build NSTProxy API URL ---
+ params = {
+ "fType": 2,
+ "count": 1,
+ "channelId": channel_id,
+ "country": country,
+ "protocol": protocol,
+ "sessionDuration": session_duration,
+ "token": token,
+ }
+ if state:
+ params["state"] = state
+ if city:
+ params["city"] = city
+
+ url = "https://api.nstproxy.com/api/v1/generate/apiproxies"
+
+ try:
+ response = requests.get(url, params=params, timeout=10)
+ response.raise_for_status()
+
+ data = response.json()
+
+ # --- Handle API error response ---
+ if isinstance(data, dict) and data.get("err"):
+ raise PermissionError(f"[NSTProxy] API Error: {data.get('msg', 'Unknown error')}")
+
+ if not isinstance(data, list) or not data:
+ raise ValueError("[NSTProxy] Invalid API response — expected a non-empty list")
+
+ proxy_info = data[0]
+
+ # --- Apply proxy config ---
+ self.proxy_config = ProxyConfig(
+ server=f"{protocol}://{proxy_info['ip']}:{proxy_info['port']}",
+ username=proxy_info["username"],
+ password=proxy_info["password"],
+ )
+
+ except Exception as e:
+ print(f"[NSTProxy] ❌ Failed to set proxy: {e}")
+ raise
+
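
A hedged usage sketch for `set_nstproxy`; the token and channel values are placeholders you would get from the NSTProxy dashboard:

```python
from crawl4ai import BrowserConfig

cfg = BrowserConfig(headless=True)
# Fetches one proxy from the NSTProxy API and stores it in cfg.proxy_config.
cfg.set_nstproxy(
    token="YOUR_NSTPROXY_TOKEN",   # placeholder
    channel_id="YOUR_CHANNEL_ID",  # placeholder
    country="US",
    session_duration=10,           # minutes; 0 rotates per request
)
print(cfg.proxy_config.server)     # e.g. http://<ip>:<port>
```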
class VirtualScrollConfig:
"""Configuration for virtual scroll handling.
@@ -831,12 +940,6 @@ class HTTPCrawlerConfig:
return HTTPCrawlerConfig.from_kwargs(config)
class CrawlerRunConfig():
- _UNWANTED_PROPS = {
- 'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
- 'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
- 'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
- 'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
- }
"""
Configuration class for controlling how the crawler runs each crawl operation.
@@ -1043,6 +1146,12 @@ class CrawlerRunConfig():
url: str = None # This is not a compulsory parameter
"""
+ _UNWANTED_PROPS = {
+ 'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
+ 'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
+ 'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
+ 'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
+ }
def __init__(
self,
@@ -1121,6 +1230,7 @@ class CrawlerRunConfig():
exclude_domains: list = None,
exclude_internal_links: bool = False,
score_links: bool = False,
+ preserve_https_for_internal_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
@@ -1244,6 +1354,7 @@ class CrawlerRunConfig():
self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links
self.score_links = score_links
+ self.preserve_https_for_internal_links = preserve_https_for_internal_links
# Debugging and Logging Parameters
self.verbose = verbose
@@ -1517,6 +1628,7 @@ class CrawlerRunConfig():
exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False),
score_links=kwargs.get("score_links", False),
+ preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
@@ -1623,6 +1735,7 @@ class CrawlerRunConfig():
"exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links,
"score_links": self.score_links,
+ "preserve_https_for_internal_links": self.preserve_https_for_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"capture_network_requests": self.capture_network_requests,
diff --git a/crawl4ai/async_crawler_strategy.back.py b/crawl4ai/async_crawler_strategy.back.py
index 9fdb0fe2..9f1ed38d 100644
--- a/crawl4ai/async_crawler_strategy.back.py
+++ b/crawl4ai/async_crawler_strategy.back.py
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error:
visibility_info = await self.check_visibility(page)
- if self.browser_config.config.verbose:
+ if self.browser_config.verbose:
self.logger.debug(
message="Body visibility info: {info}",
tag="DEBUG",
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 943867d0..76977bb9 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1383,9 +1383,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
try:
await self.adapter.evaluate(page,
f"""
- (() => {{
+ (async () => {{
try {{
- {remove_overlays_js}
+ const removeOverlays = {remove_overlays_js};
+ await removeOverlays();
return {{ success: true }};
}} catch (error) {{
return {{
diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py
index 5bb1a47c..bd44557c 100644
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -455,8 +455,6 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
# Update priorities for waiting tasks if needed
await self._update_queue_priorities()
-
- return results
except Exception as e:
if self.monitor:
@@ -467,6 +465,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
memory_monitor.cancel()
if self.monitor:
self.monitor.stop()
+ return results
async def _update_queue_priorities(self):
"""Periodically update priorities of items in the queue to prevent starvation"""
diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index d2564797..91f61837 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -845,6 +845,15 @@ class AsyncUrlSeeder:
return
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
+ base_url = str(r.url)
+
+ def _normalize_loc(raw: Optional[str]) -> Optional[str]:
+ if not raw:
+ return None
+ normalized = urljoin(base_url, raw.strip())
+ if not normalized:
+ return None
+ return normalized
        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False
@@ -857,25 +866,42 @@ class AsyncUrlSeeder:
# Use XML parser for sitemaps, not HTML parser
parser = etree.XMLParser(recover=True)
root = etree.fromstring(data, parser=parser)
+ # Namespace-agnostic lookups using local-name() so we honor custom or missing namespaces
+ sitemap_loc_nodes = root.xpath("//*[local-name()='sitemap']/*[local-name()='loc']")
+ url_loc_nodes = root.xpath("//*[local-name()='url']/*[local-name()='loc']")
- # Define namespace for sitemap
- ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
+ self._log(
+ "debug",
+ "Parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+ params={
+ "url": url,
+ "sitemap_count": len(sitemap_loc_nodes),
+ "url_count": len(url_loc_nodes),
+ },
+ tag="URL_SEED",
+ )
# Check for sitemap index entries
- sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
- if sitemap_locs:
+ if sitemap_loc_nodes:
is_sitemap_index = True
- for sitemap_elem in sitemap_locs:
- loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
+ for sitemap_elem in sitemap_loc_nodes:
+ loc = _normalize_loc(sitemap_elem.text)
if loc:
sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs
if not is_sitemap_index:
- for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
- loc = loc_elem.text.strip() if loc_elem.text else ""
+ for loc_elem in url_loc_nodes:
+ loc = _normalize_loc(loc_elem.text)
if loc:
regular_urls.append(loc)
+ if not regular_urls:
+ self._log(
+ "warning",
+                        "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+ params={"url": url},
+ tag="URL_SEED",
+ )
except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -892,19 +918,39 @@ class AsyncUrlSeeder:
# Check for sitemap index entries
sitemaps = root.findall('.//sitemap')
+ url_entries = root.findall('.//url')
+ self._log(
+ "debug",
+ "ElementTree parsed sitemap {url}: {sitemap_count} sitemap entries, {url_count} url entries discovered",
+ params={
+ "url": url,
+ "sitemap_count": len(sitemaps),
+ "url_count": len(url_entries),
+ },
+ tag="URL_SEED",
+ )
if sitemaps:
is_sitemap_index = True
for sitemap in sitemaps:
loc_elem = sitemap.find('loc')
- if loc_elem is not None and loc_elem.text:
- sub_sitemaps.append(loc_elem.text.strip())
+ loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+ if loc:
+ sub_sitemaps.append(loc)
# If not a sitemap index, get regular URLs
if not is_sitemap_index:
- for url_elem in root.findall('.//url'):
+ for url_elem in url_entries:
loc_elem = url_elem.find('loc')
- if loc_elem is not None and loc_elem.text:
- regular_urls.append(loc_elem.text.strip())
+ loc = _normalize_loc(loc_elem.text if loc_elem is not None else None)
+ if loc:
+ regular_urls.append(loc)
+ if not regular_urls:
+ self._log(
+ "warning",
+                        "No <loc> entries found inside <url> tags for sitemap {url}. The sitemap might be empty or use an unexpected structure.",
+ params={"url": url},
+ tag="URL_SEED",
+ )
except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED")
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 359aa73c..4dc52adc 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html #
###############################################################
+ from urllib.parse import urlparse
crawl_result: CrawlResult = await self.aprocess_html(
url=url,
html=html,
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
+ original_scheme=urlparse(url).scheme,
**kwargs,
)
@@ -615,7 +617,17 @@ class AsyncWebCrawler:
else config.chunking_strategy
)
sections = chunking.chunk(content)
- extracted_content = config.extraction_strategy.run(_url, sections)
+ # extracted_content = config.extraction_strategy.run(_url, sections)
+
+ # Use async version if available for better parallelism
+ if hasattr(config.extraction_strategy, 'arun'):
+ extracted_content = await config.extraction_strategy.arun(_url, sections)
+ else:
+ # Fallback to sync version run in thread pool to avoid blocking
+ extracted_content = await asyncio.to_thread(
+                    config.extraction_strategy.run, _url, sections
+ )
+
extracted_content = json.dumps(
extracted_content, indent=4, default=str, ensure_ascii=False
)
diff --git a/crawl4ai/browser_adapter.py b/crawl4ai/browser_adapter.py
index 85fef16e..3d3f5cdc 100644
--- a/crawl4ai/browser_adapter.py
+++ b/crawl4ai/browser_adapter.py
@@ -148,6 +148,134 @@ class PlaywrightAdapter(BrowserAdapter):
return Page, Error, PlaywrightTimeoutError
+class StealthAdapter(BrowserAdapter):
+ """Adapter for Playwright with stealth features using playwright_stealth"""
+
+ def __init__(self):
+ self._console_script_injected = {}
+ self._stealth_available = self._check_stealth_availability()
+
+ def _check_stealth_availability(self) -> bool:
+ """Check if playwright_stealth is available and get the correct function"""
+ try:
+ from playwright_stealth import stealth_async
+ self._stealth_function = stealth_async
+ return True
+ except ImportError:
+ try:
+ from playwright_stealth import stealth_sync
+ self._stealth_function = stealth_sync
+ return True
+ except ImportError:
+ self._stealth_function = None
+ return False
+
+ async def apply_stealth(self, page: Page):
+ """Apply stealth to a page if available"""
+ if self._stealth_available and self._stealth_function:
+ try:
+ if hasattr(self._stealth_function, '__call__'):
+ if 'async' in getattr(self._stealth_function, '__name__', ''):
+ await self._stealth_function(page)
+ else:
+ self._stealth_function(page)
+ except Exception as e:
+ # Fail silently or log error depending on requirements
+ pass
+
+ async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
+ """Standard Playwright evaluate with stealth applied"""
+ if arg is not None:
+ return await page.evaluate(expression, arg)
+ return await page.evaluate(expression)
+
+ async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
+ """Setup console capture using Playwright's event system with stealth"""
+ # Apply stealth to the page first
+ await self.apply_stealth(page)
+
+ def handle_console_capture(msg):
+ try:
+ message_type = "unknown"
+ try:
+ message_type = msg.type
+ except:
+ pass
+
+ message_text = "unknown"
+ try:
+ message_text = msg.text
+ except:
+ pass
+
+ entry = {
+ "type": message_type,
+ "text": message_text,
+ "timestamp": time.time()
+ }
+
+ captured_console.append(entry)
+
+ except Exception as e:
+ captured_console.append({
+ "type": "console_capture_error",
+ "error": str(e),
+ "timestamp": time.time()
+ })
+
+ page.on("console", handle_console_capture)
+ return handle_console_capture
+
+ async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
+ """Setup error capture using Playwright's event system"""
+ def handle_pageerror_capture(err):
+ try:
+ error_message = "Unknown error"
+ try:
+ error_message = err.message
+ except:
+ pass
+
+ error_stack = ""
+ try:
+ error_stack = err.stack
+ except:
+ pass
+
+ captured_console.append({
+ "type": "error",
+ "text": error_message,
+ "stack": error_stack,
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ captured_console.append({
+ "type": "pageerror_capture_error",
+ "error": str(e),
+ "timestamp": time.time()
+ })
+
+ page.on("pageerror", handle_pageerror_capture)
+ return handle_pageerror_capture
+
+ async def retrieve_console_messages(self, page: Page) -> List[Dict]:
+ """Not needed for Playwright - messages are captured via events"""
+ return []
+
+ async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
+ """Remove event listeners"""
+ if handle_console:
+ page.remove_listener("console", handle_console)
+ if handle_error:
+ page.remove_listener("pageerror", handle_error)
+
+ def get_imports(self) -> tuple:
+ """Return Playwright imports"""
+ from playwright.async_api import Page, Error
+ from playwright.async_api import TimeoutError as PlaywrightTimeoutError
+ return Page, Error, PlaywrightTimeoutError
+
+
class UndetectedAdapter(BrowserAdapter):
"""Adapter for undetected browser automation with stealth features"""
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 8fed970c..3ca96aed 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .utils import get_chromium_path
+import warnings
BROWSER_DISABLE_OPTIONS = [
@@ -368,6 +369,9 @@ class ManagedBrowser:
]
if self.headless:
flags.append("--headless=new")
+ # Add viewport flag if specified in config
+ if self.browser_config.viewport_height and self.browser_config.viewport_width:
+ flags.append(f"--window-size={self.browser_config.viewport_width},{self.browser_config.viewport_height}")
# merge common launch flags
flags.extend(self.build_browser_flags(self.browser_config))
elif self.browser_type == "firefox":
@@ -613,9 +617,11 @@ class BrowserManager:
# for all racers). Prevents 'Target page/context closed' errors.
self._page_lock = asyncio.Lock()
- # Stealth-related attributes
- self._stealth_instance = None
- self._stealth_cm = None
+ # Stealth adapter for stealth mode
+ self._stealth_adapter = None
+ if self.config.enable_stealth and not self.use_undetected:
+ from .browser_adapter import StealthAdapter
+ self._stealth_adapter = StealthAdapter()
# Initialize ManagedBrowser if needed
if self.config.use_managed_browser:
@@ -649,20 +655,17 @@ class BrowserManager:
else:
from playwright.async_api import async_playwright
- # Initialize playwright with or without stealth
- if self.config.enable_stealth and not self.use_undetected:
- # Import stealth only when needed
- from playwright_stealth import Stealth
- # Use the recommended stealth wrapper approach
- self._stealth_instance = Stealth()
- self._stealth_cm = self._stealth_instance.use_async(async_playwright())
- self.playwright = await self._stealth_cm.__aenter__()
- else:
- self.playwright = await async_playwright().start()
+ # Initialize playwright
+ self.playwright = await async_playwright().start()
if self.config.cdp_url or self.config.use_managed_browser:
self.config.use_managed_browser = True
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
+
+ # Add CDP endpoint verification before connecting
+ if not await self._verify_cdp_ready(cdp_url):
+ raise Exception(f"CDP endpoint at {cdp_url} is not ready after startup")
+
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
contexts = self.browser.contexts
if contexts:
@@ -683,6 +686,24 @@ class BrowserManager:
self.default_context = self.browser
+ async def _verify_cdp_ready(self, cdp_url: str) -> bool:
+ """Verify CDP endpoint is ready with exponential backoff"""
+ import aiohttp
+ self.logger.debug(f"Starting CDP verification for {cdp_url}", tag="BROWSER")
+ for attempt in range(5):
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.get(f"{cdp_url}/json/version", timeout=aiohttp.ClientTimeout(total=2)) as response:
+ if response.status == 200:
+ self.logger.debug(f"CDP endpoint ready after {attempt + 1} attempts", tag="BROWSER")
+ return True
+ except Exception as e:
+ self.logger.debug(f"CDP check attempt {attempt + 1} failed: {e}", tag="BROWSER")
+ delay = 0.5 * (1.4 ** attempt)
+ self.logger.debug(f"Waiting {delay:.2f}s before next CDP check...", tag="BROWSER")
+ await asyncio.sleep(delay)
+ self.logger.debug(f"CDP verification failed after 5 attempts", tag="BROWSER")
+ return False
def _build_browser_args(self) -> dict:
"""Build browser launch arguments from config."""
@@ -741,17 +762,18 @@ class BrowserManager:
)
os.makedirs(browser_args["downloads_path"], exist_ok=True)
- if self.config.proxy or self.config.proxy_config:
+ if self.config.proxy:
+ warnings.warn(
+ "BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
+ DeprecationWarning,
+ )
+ if self.config.proxy_config:
from playwright.async_api import ProxySettings
- proxy_settings = (
- ProxySettings(server=self.config.proxy)
- if self.config.proxy
- else ProxySettings(
- server=self.config.proxy_config.server,
- username=self.config.proxy_config.username,
- password=self.config.proxy_config.password,
- )
+ proxy_settings = ProxySettings(
+ server=self.config.proxy_config.server,
+ username=self.config.proxy_config.username,
+ password=self.config.proxy_config.password,
)
browser_args["proxy"] = proxy_settings
@@ -1007,6 +1029,19 @@ class BrowserManager:
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
return signature_hash
+ async def _apply_stealth_to_page(self, page):
+ """Apply stealth to a page if stealth mode is enabled"""
+ if self._stealth_adapter:
+ try:
+ await self._stealth_adapter.apply_stealth(page)
+ except Exception as e:
+ if self.logger:
+ self.logger.warning(
+ message="Failed to apply stealth to page: {error}",
+ tag="STEALTH",
+ params={"error": str(e)}
+ )
+
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
"""
Get a page for the given session ID, creating a new one if needed.
@@ -1036,6 +1071,7 @@ class BrowserManager:
# See GH-1198: context.pages can be empty under races
async with self._page_lock:
page = await ctx.new_page()
+ await self._apply_stealth_to_page(page)
else:
context = self.default_context
pages = context.pages
@@ -1052,6 +1088,7 @@ class BrowserManager:
page = pages[0]
else:
page = await context.new_page()
+ await self._apply_stealth_to_page(page)
else:
# Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig)
@@ -1067,6 +1104,7 @@ class BrowserManager:
# Create a new page from the chosen context
page = await context.new_page()
+ await self._apply_stealth_to_page(page)
# If a session_id is specified, store this session so we can reuse later
if crawlerRunConfig.session_id:
@@ -1133,19 +1171,5 @@ class BrowserManager:
self.managed_browser = None
if self.playwright:
- # Handle stealth context manager cleanup if it exists
- if hasattr(self, '_stealth_cm') and self._stealth_cm is not None:
- try:
- await self._stealth_cm.__aexit__(None, None, None)
- except Exception as e:
- if self.logger:
- self.logger.error(
- message="Error closing stealth context: {error}",
- tag="ERROR",
- params={"error": str(e)}
- )
- self._stealth_cm = None
- self._stealth_instance = None
- else:
- await self.playwright.stop()
+ await self.playwright.stop()
self.playwright = None
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 9ef0e616..e915ff5b 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
continue
try:
- normalized_href = normalize_url(href, url)
+ normalized_href = normalize_url(
+ href, url,
+ preserve_https=kwargs.get('preserve_https_for_internal_links', False),
+ original_scheme=kwargs.get('original_scheme')
+ )
link_data = {
"href": normalized_href,
"text": link.text_content().strip(),
@@ -538,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
if el.tag in bypass_tags:
continue
+                # Skip elements inside <pre> or <code> tags where whitespace is significant
+                # This preserves whitespace-only spans (e.g., <span> </span>) in code blocks
+ is_in_code_block = False
+ ancestor = el.getparent()
+ while ancestor is not None:
+ if ancestor.tag in ("pre", "code"):
+ is_in_code_block = True
+ break
+ ancestor = ancestor.getparent()
+
+ if is_in_code_block:
+ continue
+
text_content = (el.text_content() or "").strip()
if (
len(text_content.split()) < word_count_threshold
diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py
index 7779c9f4..58209bcb 100644
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -47,7 +47,13 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.url_scorer = url_scorer
self.include_external = include_external
self.max_pages = max_pages
- self.logger = logger or logging.getLogger(__name__)
+ # self.logger = logger or logging.getLogger(__name__)
+ # Ensure logger is always a Logger instance, not a dict from serialization
+ if isinstance(logger, logging.Logger):
+ self.logger = logger
+ else:
+ # Create a new logger if logger is None, dict, or any other non-Logger type
+ self.logger = logging.getLogger(__name__)
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0
@@ -116,11 +122,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
valid_links.append(base_url)
- # If we have more valid links than capacity, limit them
- if len(valid_links) > remaining_capacity:
- valid_links = valid_links[:remaining_capacity]
- self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
-
# Record the new depths and add to next_links
for url in valid_links:
depths[url] = new_depth
@@ -140,7 +141,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
"""
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
# Push the initial URL with score 0 and depth 0.
- await queue.put((0, 0, start_url, None))
+ initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
+ await queue.put((-initial_score, 0, start_url, None))
visited: Set[str] = set()
depths: Dict[str, int] = {start_url: 0}
@@ -187,7 +189,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
result.metadata = result.metadata or {}
result.metadata["depth"] = depth
result.metadata["parent_url"] = parent_url
- result.metadata["score"] = score
+ result.metadata["score"] = -score
# Count only successful crawls toward max_pages limit
if result.success:
@@ -208,7 +210,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
for new_url, new_parent in new_links:
new_depth = depths.get(new_url, depth + 1)
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
- await queue.put((new_score, new_depth, new_url, new_parent))
+ await queue.put((-new_score, new_depth, new_url, new_parent))
# End of crawl.
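
The sign flips above exist because `asyncio.PriorityQueue` is a min-heap: storing `-score` makes the highest-scored URL pop first, and `result.metadata["score"] = -score` restores the original value for reporting. A standalone sketch:

```python
import asyncio

async def demo():
    q: asyncio.PriorityQueue = asyncio.PriorityQueue()
    for score, url in [(0.2, "a"), (0.9, "b"), (0.5, "c")]:
        await q.put((-score, url))  # negate so the min-heap acts as a max-heap
    while not q.empty():
        neg_score, url = await q.get()
        print(url, -neg_score)      # b 0.9, then c 0.5, then a 0.2

asyncio.run(demo())
```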
diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py
index 950c3980..eb699f82 100644
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -38,7 +38,13 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.include_external = include_external
self.score_threshold = score_threshold
self.max_pages = max_pages
- self.logger = logger or logging.getLogger(__name__)
+ # self.logger = logger or logging.getLogger(__name__)
+ # Ensure logger is always a Logger instance, not a dict from serialization
+ if isinstance(logger, logging.Logger):
+ self.logger = logger
+ else:
+ # Create a new logger if logger is None, dict, or any other non-Logger type
+ self.logger = logging.getLogger(__name__)
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0
diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py
index 0eca58e3..c710a2a5 100644
--- a/crawl4ai/deep_crawling/dfs_strategy.py
+++ b/crawl4ai/deep_crawling/dfs_strategy.py
@@ -4,14 +4,26 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig
+from ..utils import normalize_url_for_deep_crawl
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
"""
- Depth-First Search (DFS) deep crawling strategy.
+ Depth-first deep crawling with familiar BFS rules.
- Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
- Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
+ We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`,
+ but walk the graph with a stack so we fully explore one branch before hopping to the
+ next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
+ discovery time without accidentally marking them as “already crawled”.
"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._dfs_seen: Set[str] = set()
+
+ def _reset_seen(self, start_url: str) -> None:
+ """Start each crawl with a clean dedupe set seeded with the root URL."""
+ self._dfs_seen = {start_url}
+
async def _arun_batch(
self,
start_url: str,
@@ -19,14 +31,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
- Batch (non-streaming) DFS mode.
- Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
+        Walk the stack depth-first, aggregating results and returning them at the end.
+
+ We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
+ hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
+ in control of traversal. Every successful page bumps ``_pages_crawled`` and
+ seeds new stack items discovered via :meth:`link_discovery`.
"""
visited: Set[str] = set()
# Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = []
+ self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -71,12 +88,16 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
- Streaming DFS mode.
- Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
+ Same traversal as :meth:`_arun_batch`, but yield pages immediately.
+
+ Each popped URL is crawled, its metadata annotated, then the result gets
+ yielded before we even look at the next stack entry. Successful crawls
+ still feed :meth:`link_discovery`, keeping DFS order intact.
"""
visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
+ self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -108,3 +129,92 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
+
+ async def link_discovery(
+ self,
+ result: CrawlResult,
+ source_url: str,
+ current_depth: int,
+ _visited: Set[str],
+ next_level: List[Tuple[str, Optional[str]]],
+ depths: Dict[str, int],
+ ) -> None:
+ """
+ Find the next URLs we should push onto the DFS stack.
+
+ Parameters
+ ----------
+ result : CrawlResult
+ Output of the page we just crawled; its ``links`` block is our raw material.
+ source_url : str
+ URL of the parent page; stored so callers can track ancestry.
+ current_depth : int
+ Depth of the parent; children naturally sit at ``current_depth + 1``.
+ _visited : Set[str]
+ Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
+ next_level : list of tuples
+ The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
+ depths : dict
+ Shared depth map so future metadata tagging knows how deep each URL lives.
+
+ Notes
+ -----
+ - ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
+ - Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
+ """
+ next_depth = current_depth + 1
+ if next_depth > self.max_depth:
+ return
+
+ remaining_capacity = self.max_pages - self._pages_crawled
+ if remaining_capacity <= 0:
+ self.logger.info(
+ f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
+ )
+ return
+
+ links = result.links.get("internal", [])
+ if self.include_external:
+ links += result.links.get("external", [])
+
+ seen = self._dfs_seen
+ valid_links: List[Tuple[str, float]] = []
+
+ for link in links:
+ raw_url = link.get("href")
+ if not raw_url:
+ continue
+
+ normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
+ if not normalized_url or normalized_url in seen:
+ continue
+
+ if not await self.can_process_url(raw_url, next_depth):
+ self.stats.urls_skipped += 1
+ continue
+
+ score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
+ if score < self.score_threshold:
+ self.logger.debug(
+ f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
+ )
+ self.stats.urls_skipped += 1
+ continue
+
+ seen.add(normalized_url)
+ valid_links.append((normalized_url, score))
+
+ if len(valid_links) > remaining_capacity:
+ if self.url_scorer:
+ valid_links.sort(key=lambda x: x[1], reverse=True)
+ valid_links = valid_links[:remaining_capacity]
+ self.logger.info(
+ f"Limiting to {remaining_capacity} URLs due to max_pages limit"
+ )
+
+ for url, score in valid_links:
+ if score:
+ result.metadata = result.metadata or {}
+ result.metadata["score"] = score
+ next_level.append((url, source_url))
+ depths[url] = next_depth
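
Putting the DFS changes together, a hedged end-to-end sketch (constructor arguments mirror the BFS base class; the URL is a placeholder):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy

async def main():
    strategy = DFSDeepCrawlStrategy(max_depth=2, max_pages=20)
    config = CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True)
    async with AsyncWebCrawler() as crawler:
        # Streaming mode yields each page as soon as its branch is visited.
        async for result in await crawler.arun("https://example.com", config=config):
            print(result.metadata.get("depth"), result.url)

asyncio.run(main())
```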
diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index b65112e2..981cbcd8 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
"""Pattern filter balancing speed and completeness"""
__slots__ = (
+ "patterns", # Store original patterns for serialization
+ "use_glob", # Store original use_glob for serialization
+ "reverse", # Store original reverse for serialization
"_simple_suffixes",
"_simple_prefixes",
"_domain_patterns",
@@ -142,6 +145,11 @@ class URLPatternFilter(URLFilter):
reverse: bool = False,
):
super().__init__()
+ # Store original constructor params for serialization
+ self.patterns = patterns
+ self.use_glob = use_glob
+ self.reverse = reverse
+
self._reverse = reverse
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py
index 4e33431f..969fee7c 100644
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, AsyncGenerator, Dict, Any
+from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
import httpx
import json
from urllib.parse import urljoin
@@ -7,6 +7,7 @@ import asyncio
from .async_configs import BrowserConfig, CrawlerRunConfig
from .models import CrawlResult
from .async_logger import AsyncLogger, LogLevel
+from .utils import hooks_to_string
class Crawl4aiClientError(Exception):
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
raise ConnectionError(f"Cannot connect to server: {str(e)}")
- def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
- crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
+ def _prepare_request(
+ self,
+ urls: List[str],
+ browser_config: Optional[BrowserConfig] = None,
+ crawler_config: Optional[CrawlerRunConfig] = None,
+ hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+ hooks_timeout: int = 30
+ ) -> Dict[str, Any]:
"""Prepare request data from configs."""
if self._token:
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
- return {
+
+ request_data = {
"urls": urls,
"browser_config": browser_config.dump() if browser_config else {},
"crawler_config": crawler_config.dump() if crawler_config else {}
}
+ # Handle hooks if provided
+ if hooks:
+ # Check if hooks are already strings or need conversion
+ if any(callable(v) for v in hooks.values()):
+ # Convert function objects to strings
+ hooks_code = hooks_to_string(hooks)
+ else:
+ # Already in string format
+ hooks_code = hooks
+
+ request_data["hooks"] = {
+ "code": hooks_code,
+ "timeout": hooks_timeout
+ }
+
+ return request_data
+
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
"""Make an HTTP request with error handling."""
url = urljoin(self.base_url, endpoint)
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
self,
urls: List[str],
browser_config: Optional[BrowserConfig] = None,
- crawler_config: Optional[CrawlerRunConfig] = None
+ crawler_config: Optional[CrawlerRunConfig] = None,
+ hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+ hooks_timeout: int = 30
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
- """Execute a crawl operation."""
+ """
+ Execute a crawl operation.
+
+ Args:
+ urls: List of URLs to crawl
+ browser_config: Browser configuration
+ crawler_config: Crawler configuration
+ hooks: Optional hooks - can be either:
+ - Dict[str, Callable]: Function objects that will be converted to strings
+ - Dict[str, str]: Already stringified hook code
+ hooks_timeout: Timeout in seconds for each hook execution (1-120)
+
+ Returns:
+ Single CrawlResult, list of results, or async generator for streaming
+
+ Example with function hooks:
+ >>> async def my_hook(page, context, **kwargs):
+ ... await page.set_viewport_size({"width": 1920, "height": 1080})
+ ... return page
+ >>>
+ >>> result = await client.crawl(
+ ... ["https://example.com"],
+ ... hooks={"on_page_context_created": my_hook}
+ ... )
+ """
await self._check_server()
-
- data = self._prepare_request(urls, browser_config, crawler_config)
+
+ data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
is_streaming = crawler_config and crawler_config.stream
-
+
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
-
+
if is_streaming:
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
else:
yield CrawlResult(**result)
return stream_results()
-
+
response = await self._request("POST", "/crawl", json=data)
result_data = response.json()
if not result_data.get("success", False):
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
-
+
results = [CrawlResult(**r) for r in result_data.get("results", [])]
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
return results[0] if len(results) == 1 else results
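
Besides callables, hooks can be shipped pre-stringified, in which case `hooks_to_string` is skipped; a sketch assuming an already-connected `Crawl4aiDockerClient` named `client` (the hook signature follows the docstring example above):

```python
string_hooks = {
    "on_page_context_created": (
        "async def on_page_context_created(page, context, **kwargs):\n"
        "    await page.set_viewport_size({'width': 1920, 'height': 1080})\n"
        "    return page"
    )
}
result = await client.crawl(
    ["https://example.com"],
    hooks=string_hooks,  # Dict[str, str]: sent as-is in request_data["hooks"]["code"]
    hooks_timeout=60,    # seconds allowed per hook execution
)
```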
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 380f83b4..4a64e5d4 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -94,6 +94,20 @@ class ExtractionStrategy(ABC):
extracted_content.extend(future.result())
return extracted_content
+ async def arun(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+ """
+ Async version: Process sections of text in parallel using asyncio.
+
+ Default implementation runs the sync version in a thread pool.
+ Subclasses can override this for true async processing.
+
+ :param url: The URL of the webpage.
+ :param sections: List of sections (strings) to process.
+ :return: A list of processed JSON blocks.
+ """
+ import asyncio
+ return await asyncio.to_thread(self.run, url, sections, *q, **kwargs)
+
class NoExtractionStrategy(ExtractionStrategy):
"""
@@ -780,6 +794,177 @@ class LLMExtractionStrategy(ExtractionStrategy):
return extracted_content
+ async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
+ """
+ Async version: Extract meaningful blocks or chunks from the given HTML using an LLM.
+
+ How it works:
+ 1. Construct a prompt with variables.
+ 2. Make an async request to the LLM using the prompt.
+ 3. Parse the response and extract blocks or chunks.
+
+ Args:
+ url: The URL of the webpage.
+ ix: Index of the block.
+ html: The HTML content of the webpage.
+
+ Returns:
+ A list of extracted blocks or chunks.
+ """
+ from .utils import aperform_completion_with_backoff
+
+ if self.verbose:
+ print(f"[LOG] Call LLM for {url} - block index: {ix}")
+
+ variable_values = {
+ "URL": url,
+ "HTML": escape_json_string(sanitize_html(html)),
+ }
+
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+ if self.instruction:
+ variable_values["REQUEST"] = self.instruction
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
+
+ if self.extract_type == "schema" and self.schema:
+ variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
+ prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
+
+ if self.extract_type == "schema" and not self.schema:
+ prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
+
+ for variable in variable_values:
+ prompt_with_variables = prompt_with_variables.replace(
+ "{" + variable + "}", variable_values[variable]
+ )
+
+ try:
+ response = await aperform_completion_with_backoff(
+ self.llm_config.provider,
+ prompt_with_variables,
+ self.llm_config.api_token,
+ base_url=self.llm_config.base_url,
+ json_response=self.force_json_response,
+ extra_args=self.extra_args,
+ )
+ # Track usage
+ usage = TokenUsage(
+ completion_tokens=response.usage.completion_tokens,
+ prompt_tokens=response.usage.prompt_tokens,
+ total_tokens=response.usage.total_tokens,
+ completion_tokens_details=response.usage.completion_tokens_details.__dict__
+ if response.usage.completion_tokens_details
+ else {},
+ prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+ if response.usage.prompt_tokens_details
+ else {},
+ )
+ self.usages.append(usage)
+
+ # Update totals
+ self.total_usage.completion_tokens += usage.completion_tokens
+ self.total_usage.prompt_tokens += usage.prompt_tokens
+ self.total_usage.total_tokens += usage.total_tokens
+
+ try:
+ content = response.choices[0].message.content
+ blocks = None
+
+ if self.force_json_response:
+ blocks = json.loads(content)
+ if isinstance(blocks, dict):
+ if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
+ blocks = list(blocks.values())[0]
+ else:
+ blocks = [blocks]
+ elif isinstance(blocks, list):
+ blocks = blocks
+ else:
+ blocks = extract_xml_data(["blocks"], content)["blocks"]
+ blocks = json.loads(blocks)
+
+ for block in blocks:
+ block["error"] = False
+ except Exception:
+ parsed, unparsed = split_and_parse_json_objects(
+ response.choices[0].message.content
+ )
+ blocks = parsed
+ if unparsed:
+ blocks.append(
+ {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+ )
+
+ if self.verbose:
+ print(
+ "[LOG] Extracted",
+ len(blocks),
+ "blocks from URL:",
+ url,
+ "block index:",
+ ix,
+ )
+ return blocks
+ except Exception as e:
+ if self.verbose:
+ print(f"[LOG] Error in LLM extraction: {e}")
+ return [
+ {
+ "index": ix,
+ "error": True,
+ "tags": ["error"],
+ "content": str(e),
+ }
+ ]
+
+ async def arun(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
+ """
+ Async version: Process sections with true parallelism using asyncio.gather.
+
+ Args:
+ url: The URL of the webpage.
+ sections: List of sections (strings) to process.
+
+ Returns:
+ A list of extracted blocks or chunks.
+ """
+ import asyncio
+
+ merged_sections = self._merge(
+ sections,
+ self.chunk_token_threshold,
+ overlap=int(self.chunk_token_threshold * self.overlap_rate),
+ )
+
+ extracted_content = []
+
+ # Create tasks for all sections to run in parallel
+ tasks = [
+ self.aextract(url, ix, sanitize_input_encode(section))
+ for ix, section in enumerate(merged_sections)
+ ]
+
+ # Execute all tasks concurrently
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+
+ # Process results
+ for result in results:
+ if isinstance(result, Exception):
+ if self.verbose:
+ print(f"Error in async extraction: {result}")
+ extracted_content.append(
+ {
+ "index": 0,
+ "error": True,
+ "tags": ["error"],
+ "content": str(result),
+ }
+ )
+ else:
+ extracted_content.extend(result)
+
+ return extracted_content
+
def show_usage(self) -> None:
"""Print a detailed token usage report showing total and per-request usage."""
print("\n=== Token Usage Summary ===")
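
A hedged sketch of the new async extraction path; `LLMExtractionStrategy(llm_config=...)` follows the existing constructor, and the sections are placeholder HTML chunks:

```python
import asyncio
from crawl4ai.async_configs import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def main():
    strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="sk-...")
    )
    sections = ["<p>First chunk...</p>", "<p>Second chunk...</p>"]
    # arun merges sections, then fans out aextract calls via asyncio.gather.
    blocks = await strategy.arun("https://example.com", sections)
    print(len(blocks), "blocks extracted")

asyncio.run(main())
```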
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 640c2f2d..63e39885 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -253,6 +253,16 @@ class CrawlResult(BaseModel):
requirements change, this is where you would update the logic.
"""
result = super().model_dump(*args, **kwargs)
+
+ # Remove any property descriptors that might have been included
+ # These deprecated properties should not be in the serialized output
+ for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
+ if key in result and isinstance(result[key], property):
+ # del result[key]
+ # Nasrin: I decided to convert it to string instead of removing it.
+ result[key] = str(result[key])
+
+ # Add the markdown field properly
if self._markdown is not None:
result["markdown"] = self._markdown.model_dump()
return result
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 73f1d2a3..68a343fb 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -47,6 +47,7 @@ from urllib.parse import (
urljoin, urlparse, urlunparse,
parse_qsl, urlencode, quote, unquote
)
+import inspect
# Monkey patch to fix wildcard handling in urllib.robotparser
@@ -1790,6 +1791,10 @@ def perform_completion_with_backoff(
except RateLimitError as e:
print("Rate limit error:", str(e))
+ if attempt == max_attempts - 1:
+ # Last attempt failed, raise the error.
+ raise
+
# Check if we have exhausted our max attempts
if attempt < max_attempts - 1:
# Calculate the delay and wait
@@ -1820,6 +1825,82 @@ def perform_completion_with_backoff(
# ]
+async def aperform_completion_with_backoff(
+ provider,
+ prompt_with_variables,
+ api_token,
+ json_response=False,
+ base_url=None,
+ **kwargs,
+):
+ """
+ Async version: Perform an API completion request with exponential backoff.
+
+ How it works:
+ 1. Sends an async completion request to the API.
+ 2. Retries on rate-limit errors with exponential delays (async).
+    3. Raises the rate-limit error once all retries are exhausted.
+
+ Args:
+ provider (str): The name of the API provider.
+ prompt_with_variables (str): The input prompt for the completion request.
+ api_token (str): The API token for authentication.
+ json_response (bool): Whether to request a JSON response. Defaults to False.
+ base_url (Optional[str]): The base URL for the API. Defaults to None.
+ **kwargs: Additional arguments for the API request.
+
+    Returns:
+        dict: The API response on success.
+
+    Raises:
+        RateLimitError: If the rate limit persists through all retry attempts.
+ """
+
+ from litellm import acompletion
+ from litellm.exceptions import RateLimitError
+ import asyncio
+
+ max_attempts = 3
+    base_delay = 2  # Base delay in seconds; adjust to suit your provider's rate limits
+
+ extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
+ if json_response:
+ extra_args["response_format"] = {"type": "json_object"}
+
+ if kwargs.get("extra_args"):
+ extra_args.update(kwargs["extra_args"])
+
+ for attempt in range(max_attempts):
+ try:
+ response = await acompletion(
+ model=provider,
+ messages=[{"role": "user", "content": prompt_with_variables}],
+ **extra_args,
+ )
+ return response # Return the successful response
+ except RateLimitError as e:
+ print("Rate limit error:", str(e))
+
+            if attempt == max_attempts - 1:
+                # Final attempt failed; propagate the rate-limit error
+                raise
+
+            # Wait with exponential backoff before the next attempt
+            delay = base_delay * (2**attempt)  # 2s, 4s, 8s, ...
+            print(f"Waiting for {delay} seconds before retrying...")
+            await asyncio.sleep(delay)
+ except Exception as e:
+ raise e # Raise any other exceptions immediately
+
+
def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None, base_url=None):
"""
Extract content blocks from website HTML using an AI provider.
@@ -2146,7 +2227,9 @@ def normalize_url(
drop_query_tracking=True,
sort_query=True,
keep_fragment=False,
- extra_drop_params=None
+ extra_drop_params=None,
+ preserve_https=False,
+ original_scheme=None
):
"""
Extended URL normalizer
@@ -2176,6 +2259,17 @@ def normalize_url(
# Resolve relative paths first
full_url = urljoin(base_url, href.strip())
+
+ # Preserve HTTPS if requested and original scheme was HTTPS
+ if preserve_https and original_scheme == 'https':
+ parsed_full = urlparse(full_url)
+ parsed_base = urlparse(base_url)
+ # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+ # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+ if (parsed_full.scheme == 'http' and
+ parsed_full.netloc == parsed_base.netloc and
+ not href.strip().startswith('//')):
+ full_url = full_url.replace('http://', 'https://', 1)
# Parse once, edit parts, then rebuild
parsed = urlparse(full_url)
@@ -2184,8 +2278,10 @@ def normalize_url(
netloc = parsed.netloc.lower()
# ── path ──
- # Strip duplicate slashes and trailing “/” (except root)
- path = quote(unquote(parsed.path))
+ # Strip duplicate slashes and trailing "/" (except root)
+ # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
+ # The path from urlparse is already properly encoded
+ path = parsed.path
if path.endswith('/') and path != '/':
path = path.rstrip('/')
@@ -2225,7 +2321,7 @@ def normalize_url(
return normalized
-def normalize_url_for_deep_crawl(href, base_url):
+def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Normalize URLs to ensure consistent format"""
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
@@ -2236,6 +2332,17 @@ def normalize_url_for_deep_crawl(href, base_url):
# Use urljoin to handle relative URLs
full_url = urljoin(base_url, href.strip())
+ # Preserve HTTPS if requested and original scheme was HTTPS
+ if preserve_https and original_scheme == 'https':
+ parsed_full = urlparse(full_url)
+ parsed_base = urlparse(base_url)
+ # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+ # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+ if (parsed_full.scheme == 'http' and
+ parsed_full.netloc == parsed_base.netloc and
+ not href.strip().startswith('//')):
+ full_url = full_url.replace('http://', 'https://', 1)
+
# Parse the URL for normalization
parsed = urlparse(full_url)
@@ -2273,7 +2380,7 @@ def normalize_url_for_deep_crawl(href, base_url):
return normalized
@lru_cache(maxsize=10000)
-def efficient_normalize_url_for_deep_crawl(href, base_url):
+def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
"""Efficient URL normalization with proper parsing"""
from urllib.parse import urljoin
@@ -2283,6 +2390,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
# Resolve relative URLs
full_url = urljoin(base_url, href.strip())
+ # Preserve HTTPS if requested and original scheme was HTTPS
+ if preserve_https and original_scheme == 'https':
+ parsed_full = urlparse(full_url)
+ parsed_base = urlparse(base_url)
+ # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+ # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+ if (parsed_full.scheme == 'http' and
+ parsed_full.netloc == parsed_base.netloc and
+ not href.strip().startswith('//')):
+ full_url = full_url.replace('http://', 'https://', 1)
+
# Use proper URL parsing
parsed = urlparse(full_url)
@@ -3488,4 +3606,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
available_gb = get_true_available_memory_gb()
used_percent = get_true_memory_usage_percent()
- return used_percent, available_gb, total_gb
\ No newline at end of file
+ return used_percent, available_gb, total_gb
+
+
+# Hook utilities for Docker API
+def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
+ """
+ Convert hook function objects to string representations for Docker API.
+
+ This utility simplifies the process of using hooks with the Docker API by converting
+ Python function objects into the string format required by the API.
+
+ Args:
+ hooks: Dictionary mapping hook point names to Python function objects.
+ Functions should be async and follow hook signature requirements.
+
+ Returns:
+ Dictionary mapping hook point names to string representations of the functions.
+
+ Example:
+ >>> async def my_hook(page, context, **kwargs):
+ ... await page.set_viewport_size({"width": 1920, "height": 1080})
+ ... return page
+ >>>
+ >>> hooks_dict = {"on_page_context_created": my_hook}
+ >>> api_hooks = hooks_to_string(hooks_dict)
+ >>> # api_hooks is now ready to use with Docker API
+
+ Raises:
+ ValueError: If a hook is not callable or source cannot be extracted
+ """
+ result = {}
+
+ for hook_name, hook_func in hooks.items():
+ if not callable(hook_func):
+ raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
+
+ try:
+ # Get the source code of the function
+ source = inspect.getsource(hook_func)
+ # Remove any leading indentation to get clean source
+ source = textwrap.dedent(source)
+ result[hook_name] = source
+ except (OSError, TypeError) as e:
+ raise ValueError(
+ f"Cannot extract source code for hook '{hook_name}'. "
+ f"Make sure the function is defined in a file (not interactively). Error: {e}"
+ )
+
+ return result
diff --git a/deploy/docker/.llm.env.example b/deploy/docker/.llm.env.example
index 254002f4..012435d8 100644
--- a/deploy/docker/.llm.env.example
+++ b/deploy/docker/.llm.env.example
@@ -10,4 +10,23 @@ GEMINI_API_TOKEN=your_gemini_key_here
# Optional: Override the default LLM provider
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
-# LLM_PROVIDER=anthropic/claude-3-opus
\ No newline at end of file
+# LLM_PROVIDER=anthropic/claude-3-opus
+
+# Optional: Global LLM temperature setting (0.0-2.0)
+# Controls randomness in responses. Lower = more focused, Higher = more creative
+# LLM_TEMPERATURE=0.7
+
+# Optional: Global custom API base URL
+# Use this to point to custom endpoints or proxy servers
+# LLM_BASE_URL=https://api.custom.com/v1
+
+# Optional: Provider-specific temperature overrides
+# These take precedence over the global LLM_TEMPERATURE
+# OPENAI_TEMPERATURE=0.5
+# ANTHROPIC_TEMPERATURE=0.3
+# GROQ_TEMPERATURE=0.8
+
+# Optional: Provider-specific base URL overrides
+# Use for provider-specific proxy endpoints
+# OPENAI_BASE_URL=https://custom-openai.company.com/v1
+# GROQ_BASE_URL=https://custom-groq.company.com/v1
\ No newline at end of file
diff --git a/deploy/docker/ARCHITECTURE.md b/deploy/docker/ARCHITECTURE.md
new file mode 100644
index 00000000..eb49cdae
--- /dev/null
+++ b/deploy/docker/ARCHITECTURE.md
@@ -0,0 +1,1149 @@
+# Crawl4AI Docker Server - Technical Architecture
+
+**Version**: 0.7.7
+**Last Updated**: October 2025
+**Status**: Production-ready with real-time monitoring
+
+This document provides a comprehensive technical overview of the Crawl4AI Docker server architecture, including the smart browser pool, real-time monitoring system, and all production optimizations.
+
+---
+
+## Table of Contents
+
+1. [System Overview](#system-overview)
+2. [Core Components](#core-components)
+3. [Smart Browser Pool](#smart-browser-pool)
+4. [Real-time Monitoring System](#real-time-monitoring-system)
+5. [API Layer](#api-layer)
+6. [Memory Management](#memory-management)
+7. [Production Optimizations](#production-optimizations)
+8. [Deployment & Operations](#deployment--operations)
+9. [Troubleshooting & Debugging](#troubleshooting--debugging)
+
+---
+
+## System Overview
+
+### Architecture Diagram
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ Client Requests │
+└────────────┬────────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────────┐
+│ FastAPI Server (server.py) │
+│ ├─ REST API Endpoints (/crawl, /html, /md, /llm, etc.) │
+│ ├─ WebSocket Endpoint (/monitor/ws) │
+│ └─ Background Tasks (janitor, timeline_updater) │
+└────┬────────────────────┬────────────────────┬──────────────┘
+ │ │ │
+ ▼ ▼ ▼
+┌─────────────┐ ┌──────────────────┐ ┌─────────────────┐
+│ Browser │ │ Monitor System │ │ Redis │
+│ Pool │ │ (monitor.py) │ │ (Persistence) │
+│ │ │ │ │ │
+│ PERMANENT ●─┤ │ ├─ Stats │ │ ├─ Endpoint │
+│ HOT_POOL ♨─┤ │ ├─ Requests │ │ │ Stats │
+│ COLD_POOL ❄─┤ │ ├─ Browsers │ │ ├─ Task │
+│ │ │ ├─ Timeline │ │ │ Results │
+│ Janitor 🧹─┤ │ └─ Events/Errors │ │ └─ Cache │
+└─────────────┘ └──────────────────┘ └─────────────────┘
+```
+
+### Key Features
+
+- **10x Memory Efficiency**: Smart 3-tier browser pooling reduces memory from 500-700MB to 50-70MB per concurrent user
+- **Real-time Monitoring**: WebSocket-based live dashboard with 2-second update intervals
+- **Production-Ready**: Comprehensive error handling, timeouts, cleanup, and graceful shutdown
+- **Container-Aware**: Accurate memory detection using cgroup v2/v1
+- **Auto-Recovery**: Graceful WebSocket fallback, lock protection, background workers
+
+---
+
+## Core Components
+
+### 1. Server Core (`server.py`)
+
+**Responsibilities:**
+- FastAPI application lifecycle management
+- Route registration and middleware
+- Background task orchestration
+- Graceful shutdown handling
+
+**Key Functions:**
+
+```python
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Application lifecycle manager"""
+    # Startup:
+    #   - Initialize Redis connection
+    #   - Create monitor stats instance
+    #   - Start persistence worker
+    #   - Initialize permanent browser
+    #   - Start janitor (browser cleanup)
+    #   - Start timeline updater (5s interval)
+
+    yield
+
+    # Shutdown:
+    #   - Cancel background tasks
+    #   - Persist final monitor stats
+    #   - Stop persistence worker
+    #   - Close all browsers
+```
+
+**Configuration:**
+- Loaded from `config.yml`
+- Browser settings, memory thresholds, rate limiting
+- LLM provider credentials
+- Server host/port
+
+### 2. API Layer (`api.py`)
+
+**Endpoints:**
+
+| Endpoint | Method | Purpose | Pool Usage |
+|----------|--------|---------|------------|
+| `/health` | GET | Health check | None |
+| `/crawl` | POST | Full crawl with all features | ✓ Pool |
+| `/crawl_stream` | POST | Streaming crawl results | ✓ Pool |
+| `/html` | POST | HTML extraction | ✓ Pool |
+| `/md` | POST | Markdown generation | ✓ Pool |
+| `/screenshot` | POST | Page screenshots | ✓ Pool |
+| `/pdf` | POST | PDF generation | ✓ Pool |
+| `/llm/{path}` | GET/POST | LLM extraction | ✓ Pool |
+| `/crawl/job` | POST | Background job creation | ✓ Pool |
+
+**Request Flow:**
+
+```python
+@app.post("/crawl")
+async def crawl(body: CrawlRequest):
+ # 1. Track request start
+ request_id = f"req_{uuid4().hex[:8]}"
+ await get_monitor().track_request_start(request_id, "/crawl", url, config)
+
+ # 2. Get browser from pool
+ from crawler_pool import get_crawler
+ crawler = await get_crawler(browser_config)
+
+ # 3. Execute crawl
+ result = await crawler.arun(url, config=crawler_config)
+
+ # 4. Track request completion
+ await get_monitor().track_request_end(request_id, success=True)
+
+ # 5. Return result (browser stays in pool)
+ return result
+```
+
+### 3. Utility Layer (`utils.py`)
+
+**Container Memory Detection:**
+
+```python
+def get_container_memory_percent() -> float:
+ """Accurate container memory detection"""
+    try:
+        # Try cgroup v2 first
+        current = int(Path("/sys/fs/cgroup/memory.current").read_text().strip())
+        max_mem = int(Path("/sys/fs/cgroup/memory.max").read_text().strip())
+        return (current / max_mem) * 100
+    except (OSError, ValueError):
+        pass
+    try:
+        # Fallback to cgroup v1
+        usage = int(Path("/sys/fs/cgroup/memory/memory.usage_in_bytes").read_text())
+        limit = int(Path("/sys/fs/cgroup/memory/memory.limit_in_bytes").read_text())
+        return (usage / limit) * 100
+    except (OSError, ValueError):
+        # Final fallback to psutil (may be inaccurate in containers)
+        return psutil.virtual_memory().percent
+```
+
+**Helper Functions:**
+- `get_base_url()`: Request base URL extraction
+- `is_task_id()`: Task ID validation
+- `should_cleanup_task()`: TTL-based cleanup logic
+- `validate_llm_provider()`: LLM configuration validation
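+
+As an example of the kind of logic these wrap, the TTL-based cleanup check might look like this (a hypothetical sketch; the actual signature in `utils.py` may differ):
+
+```python
+import time
+
+def should_cleanup_task(created_at: float, ttl_seconds: int = 3600) -> bool:
+    """A task becomes eligible for cleanup once its TTL has elapsed."""
+    return (time.time() - created_at) > ttl_seconds
+```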
+
+---
+
+## Smart Browser Pool
+
+### Architecture
+
+The browser pool implements a 3-tier strategy optimized for real-world usage patterns:
+
+```
+┌──────────────────────────────────────────────────────────┐
+│ PERMANENT Browser (Default Config) │
+│ ● Always alive, never cleaned │
+│ ● Serves 90% of requests │
+│ ● ~270MB memory │
+└──────────────────────────────────────────────────────────┘
+ ▲
+ │ 90% of requests
+ │
+┌──────────────────────────────────────────────────────────┐
+│ HOT_POOL (Frequently Used Configs) │
+│ ♨ Configs used 3+ times │
+│ ♨ Longer TTL (2-5 min depending on memory) │
+│ ♨ ~180MB per browser │
+└──────────────────────────────────────────────────────────┘
+ ▲
+ │ Promotion at 3 uses
+ │
+┌──────────────────────────────────────────────────────────┐
+│ COLD_POOL (Rarely Used Configs) │
+│ ❄ New/rare browser configs │
+│ ❄ Short TTL (30s-5min depending on memory) │
+│ ❄ ~180MB per browser │
+└──────────────────────────────────────────────────────────┘
+```
+
+### Implementation (`crawler_pool.py`)
+
+**Core Data Structures:**
+
+```python
+PERMANENT: Optional[AsyncWebCrawler] = None # Default browser
+HOT_POOL: Dict[str, AsyncWebCrawler] = {} # Frequent configs
+COLD_POOL: Dict[str, AsyncWebCrawler] = {} # Rare configs
+LAST_USED: Dict[str, float] = {} # Timestamp tracking
+USAGE_COUNT: Dict[str, int] = {} # Usage counter
+LOCK = asyncio.Lock() # Thread-safe access
+```
+
+**Browser Acquisition Flow:**
+
+```python
+async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
+ sig = _sig(cfg) # SHA1 hash of config
+
+ async with LOCK: # Prevent race conditions
+ # 1. Check permanent browser
+ if _is_default_config(sig):
+ return PERMANENT
+
+ # 2. Check hot pool
+ if sig in HOT_POOL:
+ USAGE_COUNT[sig] += 1
+ return HOT_POOL[sig]
+
+ # 3. Check cold pool (with promotion logic)
+ if sig in COLD_POOL:
+ USAGE_COUNT[sig] += 1
+ if USAGE_COUNT[sig] >= 3:
+ # Promote to hot pool
+ HOT_POOL[sig] = COLD_POOL.pop(sig)
+ await get_monitor().track_janitor_event("promote", sig, {...})
+ return HOT_POOL[sig]
+ return COLD_POOL[sig]
+
+        # 4. Memory check before creating new
+        mem_pct = get_container_memory_percent()
+        if mem_pct >= MEM_LIMIT:
+            raise MemoryError(f"Memory at {mem_pct:.0f}%, refusing new browser")
+
+ # 5. Create new browser in cold pool
+ crawler = AsyncWebCrawler(config=cfg)
+ await crawler.start()
+ COLD_POOL[sig] = crawler
+ return crawler
+```
+
+**Janitor (Adaptive Cleanup):**
+
+```python
+async def janitor():
+ """Memory-adaptive browser cleanup"""
+ while True:
+ mem_pct = get_container_memory_percent()
+
+ # Adaptive intervals based on memory pressure
+ if mem_pct > 80:
+ interval, cold_ttl, hot_ttl = 10, 30, 120 # Aggressive
+ elif mem_pct > 60:
+ interval, cold_ttl, hot_ttl = 30, 60, 300 # Moderate
+ else:
+ interval, cold_ttl, hot_ttl = 60, 300, 600 # Relaxed
+
+        await asyncio.sleep(interval)
+
+        now = time.time()
+        async with LOCK:
+ # Clean cold pool first (less valuable)
+ for sig in list(COLD_POOL.keys()):
+ if now - LAST_USED[sig] > cold_ttl:
+ await COLD_POOL[sig].close()
+ del COLD_POOL[sig], LAST_USED[sig], USAGE_COUNT[sig]
+ await track_janitor_event("close_cold", sig, {...})
+
+ # Clean hot pool (more conservative)
+ for sig in list(HOT_POOL.keys()):
+ if now - LAST_USED[sig] > hot_ttl:
+ await HOT_POOL[sig].close()
+ del HOT_POOL[sig], LAST_USED[sig], USAGE_COUNT[sig]
+ await track_janitor_event("close_hot", sig, {...})
+```
+
+**Config Signature Generation:**
+
+```python
+def _sig(cfg: BrowserConfig) -> str:
+ """Generate unique signature for browser config"""
+ payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
+ return hashlib.sha1(payload.encode()).hexdigest()
+```
+
+---
+
+## Real-time Monitoring System
+
+### Architecture
+
+The monitoring system provides real-time insights via WebSocket with automatic fallback to HTTP polling.
+
+**Components:**
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ MonitorStats Class (monitor.py) │
+│ ├─ In-memory queues (deques with maxlen) │
+│ ├─ Background persistence worker │
+│ ├─ Timeline tracking (5-min window, 5s resolution) │
+│ └─ Time-based expiry (5min for old entries) │
+└───────────┬─────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────┐
+│ WebSocket Endpoint (/monitor/ws) │
+│ ├─ 2-second update intervals │
+│ ├─ Auto-reconnect with exponential backoff │
+│ ├─ Comprehensive data payload │
+│ └─ Graceful fallback to polling │
+└───────────┬─────────────────────────────────────────────┘
+ │
+ ▼
+┌─────────────────────────────────────────────────────────┐
+│ Dashboard UI (static/monitor/index.html) │
+│ ├─ Connection status indicator │
+│ ├─ Live updates (health, requests, browsers) │
+│ ├─ Timeline charts (memory, requests, browsers) │
+│ └─ Janitor events & error logs │
+└─────────────────────────────────────────────────────────┘
+```
+
+### Monitor Stats (`monitor.py`)
+
+**Data Structures:**
+
+```python
+class MonitorStats:
+ # In-memory queues
+ active_requests: Dict[str, Dict] # Currently processing
+ completed_requests: deque(maxlen=100) # Last 100 completed
+ janitor_events: deque(maxlen=100) # Cleanup events
+ errors: deque(maxlen=100) # Error log
+
+ # Endpoint stats (persisted to Redis)
+ endpoint_stats: Dict[str, Dict] # Aggregated stats
+
+ # Timeline data (5min window, 5s resolution = 60 points)
+ memory_timeline: deque(maxlen=60)
+ requests_timeline: deque(maxlen=60)
+ browser_timeline: deque(maxlen=60)
+
+ # Background persistence
+ _persist_queue: asyncio.Queue(maxsize=10)
+ _persist_worker_task: Optional[asyncio.Task]
+```
+
+**Request Tracking:**
+
+```python
+async def track_request_start(request_id, endpoint, url, config):
+ """Track new request"""
+ self.active_requests[request_id] = {
+ "id": request_id,
+ "endpoint": endpoint,
+ "url": url,
+ "start_time": time.time(),
+ "mem_start": psutil.Process().memory_info().rss / (1024 * 1024)
+ }
+
+ # Update endpoint stats
+ if endpoint not in self.endpoint_stats:
+ self.endpoint_stats[endpoint] = {
+ "count": 0, "total_time": 0, "errors": 0,
+ "pool_hits": 0, "success": 0
+ }
+ self.endpoint_stats[endpoint]["count"] += 1
+
+ # Queue background persistence
+ self._persist_queue.put_nowait(True)
+
+async def track_request_end(request_id, success, error=None, ...):
+    """Track request completion"""
+    req_info = self.active_requests.pop(request_id)
+    endpoint = req_info["endpoint"]
+    elapsed = time.time() - req_info["start_time"]
+    current_mem = psutil.Process().memory_info().rss / (1024 * 1024)
+    mem_delta = current_mem - req_info["mem_start"]
+
+ # Add to completed queue
+ self.completed_requests.append({
+ "id": request_id,
+ "endpoint": req_info["endpoint"],
+ "url": req_info["url"],
+ "success": success,
+ "elapsed": elapsed,
+ "mem_delta": mem_delta,
+ "end_time": time.time()
+ })
+
+ # Update stats
+ self.endpoint_stats[endpoint]["success" if success else "errors"] += 1
+ await self._persist_endpoint_stats()
+```
+
+**Background Persistence Worker:**
+
+```python
+async def _persistence_worker(self):
+ """Background worker for Redis persistence"""
+ while True:
+ try:
+ await self._persist_queue.get()
+ await self._persist_endpoint_stats()
+ self._persist_queue.task_done()
+ except asyncio.CancelledError:
+ break
+ except Exception as e:
+ logger.error(f"Persistence worker error: {e}")
+
+async def _persist_endpoint_stats(self):
+ """Persist stats to Redis with error handling"""
+ try:
+ await self.redis.set(
+ "monitor:endpoint_stats",
+ json.dumps(self.endpoint_stats),
+ ex=86400 # 24h TTL
+ )
+ except Exception as e:
+ logger.warning(f"Failed to persist endpoint stats: {e}")
+```
+
+**Time-based Cleanup:**
+
+```python
+def _cleanup_old_entries(self, max_age_seconds=300):
+ """Remove entries older than 5 minutes"""
+ now = time.time()
+ cutoff = now - max_age_seconds
+
+ # Clean completed requests
+ while self.completed_requests and \
+ self.completed_requests[0].get("end_time", 0) < cutoff:
+ self.completed_requests.popleft()
+
+ # Clean janitor events
+ while self.janitor_events and \
+ self.janitor_events[0].get("timestamp", 0) < cutoff:
+ self.janitor_events.popleft()
+
+ # Clean errors
+ while self.errors and \
+ self.errors[0].get("timestamp", 0) < cutoff:
+ self.errors.popleft()
+```
+
+### WebSocket Implementation (`monitor_routes.py`)
+
+**Endpoint:**
+
+```python
+@router.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+ """Real-time monitoring updates"""
+ await websocket.accept()
+ logger.info("WebSocket client connected")
+
+ try:
+ while True:
+ try:
+ monitor = get_monitor()
+
+ # Gather comprehensive monitoring data
+ data = {
+ "timestamp": time.time(),
+ "health": await monitor.get_health_summary(),
+ "requests": {
+ "active": monitor.get_active_requests(),
+ "completed": monitor.get_completed_requests(limit=10)
+ },
+ "browsers": await monitor.get_browser_list(),
+ "timeline": {
+ "memory": monitor.get_timeline_data("memory", "5m"),
+ "requests": monitor.get_timeline_data("requests", "5m"),
+ "browsers": monitor.get_timeline_data("browsers", "5m")
+ },
+ "janitor": monitor.get_janitor_log(limit=10),
+ "errors": monitor.get_errors_log(limit=10)
+ }
+
+ await websocket.send_json(data)
+ await asyncio.sleep(2) # 2-second update interval
+
+ except WebSocketDisconnect:
+ logger.info("WebSocket client disconnected")
+ break
+ except Exception as e:
+ logger.error(f"WebSocket error: {e}", exc_info=True)
+ await asyncio.sleep(2)
+ except Exception as e:
+ logger.error(f"WebSocket connection error: {e}", exc_info=True)
+ finally:
+ logger.info("WebSocket connection closed")
+```
+
+**Input Validation:**
+
+```python
+@router.get("/requests")
+async def get_requests(status: str = "all", limit: int = 50):
+ # Input validation
+ if status not in ["all", "active", "completed", "success", "error"]:
+ raise HTTPException(400, f"Invalid status: {status}")
+ if limit < 1 or limit > 1000:
+ raise HTTPException(400, f"Invalid limit: {limit}")
+
+ monitor = get_monitor()
+ # ... return data
+```
+
+### Frontend Dashboard
+
+**Connection Management:**
+
+```javascript
+// WebSocket with auto-reconnect
+function connectWebSocket() {
+ if (wsReconnectAttempts >= MAX_WS_RECONNECT) {
+ // Fallback to polling after 5 failed attempts
+ useWebSocket = false;
+ updateConnectionStatus('polling');
+ startAutoRefresh();
+ return;
+ }
+
+ updateConnectionStatus('connecting');
+ const wsUrl = `${protocol}//${window.location.host}/monitor/ws`;
+ websocket = new WebSocket(wsUrl);
+
+ websocket.onopen = () => {
+ wsReconnectAttempts = 0;
+ updateConnectionStatus('connected');
+ stopAutoRefresh(); // Stop polling
+ };
+
+ websocket.onmessage = (event) => {
+ const data = JSON.parse(event.data);
+ updateDashboard(data); // Update all sections
+ };
+
+    websocket.onclose = () => {
+        updateConnectionStatus('disconnected', 'Reconnecting...');
+        if (useWebSocket) {
+            wsReconnectAttempts++;  // Back off further on each failed attempt
+            setTimeout(connectWebSocket, 2000 * wsReconnectAttempts);
+        } else {
+            startAutoRefresh(); // Fallback to polling
+        }
+    };
+}
+```
+
+**Connection Status Indicator:**
+
+| Status | Color | Animation | Meaning |
+|--------|-------|-----------|---------|
+| Live | Green | Pulsing fast | WebSocket connected |
+| Connecting... | Yellow | Pulsing slow | Attempting connection |
+| Polling | Blue | Pulsing slow | HTTP polling fallback |
+| Disconnected | Red | None | Connection failed |
+
+---
+
+## API Layer
+
+### Request/Response Flow
+
+```
+Client Request
+ │
+ ▼
+FastAPI Route Handler
+ │
+ ├─→ Monitor: track_request_start()
+ │
+ ├─→ Browser Pool: get_crawler(config)
+ │ │
+ │ ├─→ Check PERMANENT
+ │ ├─→ Check HOT_POOL
+ │ ├─→ Check COLD_POOL
+ │ └─→ Create New (if needed)
+ │
+ ├─→ Execute Crawl
+ │ │
+ │ ├─→ Fetch page
+ │ ├─→ Extract content
+ │ ├─→ Apply filters/strategies
+ │ └─→ Return result
+ │
+ ├─→ Monitor: track_request_end()
+ │
+ └─→ Return Response (browser stays in pool)
+```
+
+### Error Handling Strategy
+
+**Levels:**
+
+1. **Route Level**: HTTP exceptions with proper status codes
+2. **Monitor Level**: Try-except with logging, non-critical failures
+3. **Pool Level**: Memory checks, lock protection, graceful degradation
+4. **WebSocket Level**: Auto-reconnect, fallback to polling
+
+**Example:**
+
+```python
+@app.post("/crawl")
+async def crawl(body: CrawlRequest):
+ request_id = f"req_{uuid4().hex[:8]}"
+
+ try:
+ # Monitor tracking (non-blocking on failure)
+ try:
+ await get_monitor().track_request_start(...)
+ except:
+ pass # Monitor not critical
+
+ # Browser acquisition (with memory protection)
+ crawler = await get_crawler(browser_config)
+
+ # Crawl execution
+ result = await crawler.arun(url, config=cfg)
+
+ # Success tracking
+ try:
+ await get_monitor().track_request_end(request_id, success=True)
+ except:
+ pass
+
+ return result
+
+ except MemoryError as e:
+ # Memory pressure - return 503
+ await get_monitor().track_request_end(request_id, success=False, error=str(e))
+ raise HTTPException(503, "Server at capacity")
+ except Exception as e:
+ # General errors - return 500
+ await get_monitor().track_request_end(request_id, success=False, error=str(e))
+ raise HTTPException(500, str(e))
+```
+
+---
+
+## Memory Management
+
+### Container Memory Detection
+
+**Priority Order:**
+1. cgroup v2 (`/sys/fs/cgroup/memory.{current,max}`)
+2. cgroup v1 (`/sys/fs/cgroup/memory/memory.{usage,limit}_in_bytes`)
+3. psutil fallback (may be inaccurate in containers)
+
+**Usage:**
+
+```python
+mem_pct = get_container_memory_percent()
+
+if mem_pct >= 95:  # Critical
+    raise MemoryError("Refusing new browser")
+elif mem_pct > 80:  # High pressure
+    interval, cold_ttl = 10, 30    # Janitor: aggressive cleanup
+elif mem_pct > 60:  # Moderate pressure
+    interval, cold_ttl = 30, 60    # Janitor: moderate cleanup
+else:  # Normal
+    interval, cold_ttl = 60, 300   # Janitor: relaxed cleanup
+```
+
+### Memory Budgets
+
+| Component | Memory | Notes |
+|-----------|--------|-------|
+| Base Container | 270 MB | Python + FastAPI + libraries |
+| Permanent Browser | 270 MB | Always-on default browser |
+| Hot Pool Browser | 180 MB | Per frequently-used config |
+| Cold Pool Browser | 180 MB | Per rarely-used config |
+| Active Crawl Overhead | 50-200 MB | Temporary, released after request |
+
+**Example Calculation:**
+
+```
+Container: 270 MB
+Permanent: 270 MB
+2x Hot: 360 MB
+1x Cold: 180 MB
+Total: 1080 MB baseline
+
+Under load (10 concurrent):
++ Active crawls: ~500-1000 MB
+= Peak: 1.5-2 GB
+```
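+
+To sanity-check a deployment against these budgets, a back-of-envelope estimator (a sketch using the figures from the table above; the function name and the 100 MB per-crawl midpoint are illustrative):
+
+```python
+def estimate_memory_mb(hot: int, cold: int, active_crawls: int) -> int:
+    """Rough memory budget from the per-component figures above."""
+    BASE, PERMANENT, BROWSER = 270, 270, 180  # MB, from the budget table
+    CRAWL_OVERHEAD = 100                      # MB, midpoint of the 50-200 MB range
+    return BASE + PERMANENT + BROWSER * (hot + cold) + CRAWL_OVERHEAD * active_crawls
+
+print(estimate_memory_mb(hot=2, cold=1, active_crawls=0))   # 1080 MB baseline
+print(estimate_memory_mb(hot=2, cold=1, active_crawls=10))  # ~2080 MB under load
+```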
+
+---
+
+## Production Optimizations
+
+### Code Review Fixes Applied
+
+**Critical (3):**
+1. ✅ Lock protection for browser pool access
+2. ✅ Async track_janitor_event implementation
+3. ✅ Error handling in request tracking
+
+**Important (8):**
+4. ✅ Background persistence worker (replaces fire-and-forget)
+5. ✅ Time-based expiry (5min cleanup for old entries)
+6. ✅ Input validation (status, limit, metric, window)
+7. ✅ Timeline updater timeout (4s max)
+8. ✅ Warn when killing browsers with active requests
+9. ✅ Monitor cleanup on shutdown
+10. ✅ Document memory estimates
+11. ✅ Structured error responses (HTTPException)
+
+### Performance Characteristics
+
+**Latency:**
+
+| Scenario | Time | Notes |
+|----------|------|-------|
+| Pool Hit (Permanent) | <100ms | Browser ready |
+| Pool Hit (Hot/Cold) | <100ms | Browser ready |
+| New Browser Creation | 3-5s | Chromium startup |
+| Simple Page Fetch | 1-3s | Network + render |
+| Complex Extraction | 5-10s | LLM processing |
+
+**Throughput:**
+
+| Load | Concurrent | Response Time | Success Rate |
+|------|-----------|---------------|--------------|
+| Light | 1-10 | <3s | 100% |
+| Medium | 10-50 | 3-8s | 100% |
+| Heavy | 50-100 | 8-15s | 95-100% |
+| Extreme | 100+ | 15-30s | 80-95% |
+
+### Reliability Features
+
+**Race Condition Protection:**
+- `asyncio.Lock` on all pool operations
+- Lock on browser pool stats access
+- Async janitor event tracking
+
+**Graceful Degradation:**
+- WebSocket → HTTP polling fallback
+- Redis persistence failures (logged, non-blocking)
+- Monitor tracking failures (logged, non-blocking)
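+
+One way to package the non-blocking monitor pattern is a small wrapper (an illustrative sketch assuming a module-level `logger`; the server inlines try/except around monitor calls instead):
+
+```python
+async def safe_track(coro) -> None:
+    """Await a monitor call; failures are logged and never break the request path."""
+    try:
+        await coro
+    except Exception as e:
+        logger.warning(f"Monitor tracking failed (non-critical): {e}")
+```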
+
+**Resource Cleanup:**
+- Janitor cleanup (adaptive intervals)
+- Time-based expiry (5min for old data)
+- Shutdown cleanup (persist final stats, close browsers)
+- Background worker cancellation
+
+---
+
+## Deployment & Operations
+
+### Running Locally
+
+```bash
+# Install dependencies
+pip install -r requirements.txt
+
+# Configure
+cp .llm.env.example .llm.env
+# Edit .llm.env with your API keys
+
+# Run server
+python -m uvicorn server:app --host 0.0.0.0 --port 11235 --reload
+```
+
+### Docker Deployment
+
+```bash
+# Build image
+docker build -t crawl4ai:latest -f Dockerfile .
+
+# Run container
+docker run -d \
+ --name crawl4ai \
+ -p 11235:11235 \
+ --shm-size=1g \
+ --env-file .llm.env \
+ crawl4ai:latest
+```
+
+### Production Configuration
+
+**`config.yml` Key Settings:**
+
+```yaml
+crawler:
+ browser:
+ extra_args:
+ - "--disable-gpu"
+ - "--disable-dev-shm-usage"
+ - "--no-sandbox"
+ kwargs:
+ headless: true
+ text_mode: true # Reduces memory by 30-40%
+
+ memory_threshold_percent: 95 # Refuse new browsers above this
+
+ pool:
+ idle_ttl_sec: 300 # Base TTL for cold pool (5 min)
+
+ rate_limiter:
+ enabled: true
+ base_delay: [1.0, 3.0] # Random delay between requests
+```
+
+### Monitoring
+
+**Access Dashboard:**
+```
+http://localhost:11235/static/monitor/
+```
+
+**Check Logs:**
+```bash
+# All activity
+docker logs crawl4ai -f
+
+# Pool activity only
+docker logs crawl4ai | grep -E "(🔥|♨️|❄️|🆕|⬆️)"
+
+# Errors only
+docker logs crawl4ai | grep ERROR
+```
+
+**Metrics:**
+```bash
+# Container stats
+docker stats crawl4ai
+
+# Memory percentage
+curl http://localhost:11235/monitor/health | jq '.container.memory_percent'
+
+# Pool status
+curl http://localhost:11235/monitor/browsers | jq '.summary'
+```
+
+---
+
+## Troubleshooting & Debugging
+
+### Common Issues
+
+**1. WebSocket Not Connecting**
+
+Symptoms: Yellow "Connecting..." indicator, falls back to blue "Polling"
+
+Debug:
+```bash
+# Check server logs
+docker logs crawl4ai | grep WebSocket
+
+# Test WebSocket manually
+python test-websocket.py
+```
+
+Fix: Check firewall/proxy settings, ensure port 11235 accessible
+
+**2. High Memory Usage**
+
+Symptoms: Container OOM kills, 503 errors, slow responses
+
+Debug:
+```bash
+# Check current memory
+curl http://localhost:11235/monitor/health | jq '.container.memory_percent'
+
+# Check browser pool
+curl http://localhost:11235/monitor/browsers
+
+# Check janitor activity
+docker logs crawl4ai | grep "🧹"
+```
+
+Fix:
+- Lower `memory_threshold_percent` in config.yml
+- Increase container memory limit
+- Enable `text_mode: true` in browser config
+- Reduce idle_ttl_sec for more aggressive cleanup
+
+**3. Browser Pool Not Reusing**
+
+Symptoms: High "New Created" count, poor reuse rate
+
+Debug:
+```python
+# Check config signature matching
+from crawl4ai import BrowserConfig
+import json, hashlib
+
+cfg = BrowserConfig(...) # Your config
+sig = hashlib.sha1(json.dumps(cfg.to_dict(), sort_keys=True).encode()).hexdigest()
+print(f"Config signature: {sig[:8]}")
+```
+
+Check logs for permanent browser signature:
+```bash
+docker logs crawl4ai | grep "permanent"
+```
+
+Fix: Ensure endpoint configs match permanent browser config exactly
+
+**4. Janitor Not Cleaning Up**
+
+Symptoms: Memory stays high after idle period
+
+Debug:
+```bash
+# Check janitor events
+curl http://localhost:11235/monitor/logs/janitor
+
+# Check pool stats over time
+watch -n 5 'curl -s http://localhost:11235/monitor/browsers | jq ".summary"'
+```
+
+Fix: Usually none needed; this is expected behavior:
+- Janitor runs every 10-60s depending on memory pressure
+- Hot pool browsers have a longer TTL (by design)
+- The permanent browser is never cleaned (by design)
+
+### Debug Tools
+
+**Config Signature Checker:**
+
+```python
+from crawl4ai import BrowserConfig
+import json, hashlib
+
+def check_sig(cfg: BrowserConfig) -> str:
+ payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
+ sig = hashlib.sha1(payload.encode()).hexdigest()
+ return sig[:8]
+
+# Example
+cfg1 = BrowserConfig()
+cfg2 = BrowserConfig(headless=True)
+print(f"Default: {check_sig(cfg1)}")
+print(f"Custom: {check_sig(cfg2)}")
+```
+
+**Monitor Stats Dumper:**
+
+```bash
+#!/bin/bash
+# Dump all monitor stats to JSON
+
+curl -s http://localhost:11235/monitor/health > health.json
+curl -s http://localhost:11235/monitor/requests?limit=100 > requests.json
+curl -s http://localhost:11235/monitor/browsers > browsers.json
+curl -s http://localhost:11235/monitor/logs/janitor > janitor.json
+curl -s http://localhost:11235/monitor/logs/errors > errors.json
+
+echo "Monitor stats dumped to *.json files"
+```
+
+**WebSocket Test Script:**
+
+```python
+# test-websocket.py (included in repo)
+import asyncio
+import websockets
+import json
+
+async def test_websocket():
+ uri = "ws://localhost:11235/monitor/ws"
+ async with websockets.connect(uri) as websocket:
+ for i in range(5):
+ message = await websocket.recv()
+ data = json.loads(message)
+ print(f"\nUpdate #{i+1}:")
+ print(f" Health: CPU {data['health']['container']['cpu_percent']}%")
+ print(f" Active Requests: {len(data['requests']['active'])}")
+ print(f" Browsers: {len(data['browsers'])}")
+
+asyncio.run(test_websocket())
+```
+
+### Performance Tuning
+
+**For High Throughput:**
+
+```yaml
+# config.yml
+crawler:
+ memory_threshold_percent: 90 # Allow more browsers
+ pool:
+ idle_ttl_sec: 600 # Keep browsers longer
+ rate_limiter:
+ enabled: false # Disable for max speed
+```
+
+**For Low Memory:**
+
+```yaml
+# config.yml
+crawler:
+ browser:
+ kwargs:
+ text_mode: true # 30-40% memory reduction
+ memory_threshold_percent: 80 # More conservative
+ pool:
+ idle_ttl_sec: 60 # Aggressive cleanup
+```
+
+**For Stability:**
+
+```yaml
+# config.yml
+crawler:
+ memory_threshold_percent: 85 # Balanced
+ pool:
+ idle_ttl_sec: 300 # Moderate cleanup
+ rate_limiter:
+ enabled: true
+ base_delay: [2.0, 5.0] # Prevent rate limiting
+```
+
+---
+
+## Test Suite
+
+**Location:** `deploy/docker/tests/`
+
+**Tests:**
+
+1. `test_1_basic.py` - Health check, container lifecycle
+2. `test_2_memory.py` - Memory tracking, leak detection
+3. `test_3_pool.py` - Pool reuse validation
+4. `test_4_concurrent.py` - Concurrent load testing
+5. `test_5_pool_stress.py` - Multi-config pool behavior
+6. `test_6_multi_endpoint.py` - All endpoint validation
+7. `test_7_cleanup.py` - Janitor cleanup verification
+
+**Run All Tests:**
+
+```bash
+cd deploy/docker/tests
+pip install -r requirements.txt
+
+# Build image first
+cd /path/to/repo
+docker build -t crawl4ai-local:latest .
+
+# Run tests
+cd deploy/docker/tests
+for test in test_*.py; do
+ echo "Running $test..."
+ python $test || break
+done
+```
+
+---
+
+## Architecture Decision Log
+
+### Why 3-Tier Pool?
+
+**Decision:** PERMANENT + HOT_POOL + COLD_POOL
+
+**Rationale:**
+- 90% of requests use default config → permanent browser serves most traffic
+- Frequent variants (hot) deserve longer TTL for better reuse
+- Rare configs (cold) should be cleaned aggressively to save memory
+
+**Alternatives Considered:**
+- Single pool: Too simple, no optimization for common case
+- LRU cache: Doesn't capture "hot" vs "rare" distinction
+- Per-endpoint pools: Too complex, over-engineering
+
+### Why WebSocket + Polling Fallback?
+
+**Decision:** WebSocket primary, HTTP polling backup
+
+**Rationale:**
+- WebSocket provides real-time updates (2s interval)
+- Polling fallback ensures reliability in restricted networks
+- Auto-reconnect handles temporary disconnections
+
+**Alternatives Considered:**
+- Polling only: Works but higher latency, more server load
+- WebSocket only: Fails in restricted networks
+- Server-Sent Events: One-way, no client messages
+
+### Why Background Persistence Worker?
+
+**Decision:** Queue-based worker for Redis operations
+
+**Rationale:**
+- Fire-and-forget loses data on failures
+- Queue provides buffering and retry capability
+- Non-blocking keeps request path fast
+
+**Alternatives Considered:**
+- Synchronous writes: Blocks request handling
+- Fire-and-forget: Silent failures
+- Batch writes: Complex state management
+
+---
+
+## Contributing
+
+When modifying the architecture:
+
+1. **Maintain backward compatibility** in API contracts
+2. **Add tests** for new functionality
+3. **Update this document** with architectural changes
+4. **Profile memory impact** before production
+5. **Test under load** using the test suite
+
+**Code Review Checklist:**
+- [ ] Race conditions protected with locks
+- [ ] Error handling with proper logging
+- [ ] Graceful degradation on failures
+- [ ] Memory impact measured
+- [ ] Tests added/updated
+- [ ] Documentation updated
+
+---
+
+## License & Credits
+
+**Crawl4AI** - Created by Unclecode
+**GitHub**: https://github.com/unclecode/crawl4ai
+**License**: See LICENSE file in repository
+
+**Architecture & Optimizations**: October 2025
+**WebSocket Monitoring**: October 2025
+**Production Hardening**: October 2025
+
+---
+
+**End of Technical Architecture Document**
+
+For questions or issues, please open a GitHub issue at:
+https://github.com/unclecode/crawl4ai/issues
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index 49e0030b..6cf9c5bd 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -12,6 +12,7 @@
- [Python SDK](#python-sdk)
- [Understanding Request Schema](#understanding-request-schema)
- [REST API Examples](#rest-api-examples)
+ - [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks)
- [Additional API Endpoints](#additional-api-endpoints)
- [HTML Extraction Endpoint](#html-extraction-endpoint)
- [Screenshot Endpoint](#screenshot-endpoint)
@@ -58,15 +59,13 @@ Pull and run images directly from Docker Hub without building locally.
#### 1. Pull the Image
-Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
-
-> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
+Our latest stable release is `0.7.7`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
```bash
-# Pull the release candidate (for testing new features)
-docker pull unclecode/crawl4ai:0.7.0-r1
+# Pull the latest stable version (0.7.7)
+docker pull unclecode/crawl4ai:0.7.7
-# Or pull the current stable version (0.6.0)
+# Or use the latest tag (points to 0.7.7)
docker pull unclecode/crawl4ai:latest
```
@@ -101,7 +100,7 @@ EOL
-p 11235:11235 \
--name crawl4ai \
--shm-size=1g \
- unclecode/crawl4ai:0.7.0-r1
+ unclecode/crawl4ai:0.7.7
```
* **With LLM support:**
@@ -112,7 +111,7 @@ EOL
--name crawl4ai \
--env-file .llm.env \
--shm-size=1g \
- unclecode/crawl4ai:0.7.0-r1
+ unclecode/crawl4ai:0.7.7
```
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
@@ -185,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
```bash
-# Pulls and runs the release candidate from Docker Hub
+# Pulls and runs the stable release from Docker Hub
# Automatically selects the correct architecture
- IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
+ IMAGE=unclecode/crawl4ai:0.7.7 docker compose up -d
```
* **Build and Run Locally:**
@@ -648,6 +647,194 @@ async def test_stream_crawl(token: str = None): # Made token optional
# asyncio.run(test_stream_crawl())
```
+### Asynchronous Jobs with Webhooks
+
+For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete.
+
+#### Why Use Jobs & Webhooks?
+
+- **No Polling Required** - Get notified when crawls complete instead of constantly checking status
+- **Better Resource Usage** - Free up client connections while jobs run in the background
+- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices
+- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s)
+
+#### How It Works
+
+1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config`
+2. **Get Task ID** → Receive a `task_id` immediately
+3. **Job Runs** → Crawl executes in the background
+4. **Webhook Fired** → Server POSTs completion notification to your webhook URL
+5. **Fetch Results** → If data wasn't included in webhook, GET `/crawl/job/{task_id}`
+
+#### Quick Example
+
+```bash
+# Submit a crawl job with webhook notification
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+ "webhook_data_in_payload": false
+ }
+ }'
+
+# Response: {"task_id": "crawl_a1b2c3d4"}
+```
+
+**Your webhook receives:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4",
+ "task_type": "crawl",
+ "status": "completed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com"]
+}
+```
+
+Then fetch the results:
+```bash
+curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
+```
+
+#### Include Data in Webhook
+
+Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook:
+
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+ "webhook_data_in_payload": true
+ }
+ }'
+```
+
+**Your webhook receives the complete data:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4",
+ "task_type": "crawl",
+ "status": "completed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com"],
+ "data": {
+ "markdown": "...",
+ "html": "...",
+ "links": {...},
+ "metadata": {...}
+ }
+}
+```
+
+#### Webhook Authentication
+
+Add custom headers for authentication:
+
+```json
+{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl",
+ "webhook_data_in_payload": false,
+ "webhook_headers": {
+ "X-Webhook-Secret": "your-secret-token",
+ "X-Service-ID": "crawl4ai-prod"
+ }
+ }
+}
+```
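+
+On the receiving end, verify the secret header before trusting the payload. A minimal receiver sketch (FastAPI; the route path, header name, and secret value are illustrative and must match the `webhook_headers` you configured above):
+
+```python
+from fastapi import FastAPI, Header, HTTPException, Request
+
+app = FastAPI()
+
+@app.post("/webhooks/crawl")
+async def crawl_webhook(request: Request, x_webhook_secret: str = Header(None)):
+    # Reject calls that don't carry the shared secret
+    if x_webhook_secret != "your-secret-token":
+        raise HTTPException(status_code=401, detail="Invalid webhook secret")
+    payload = await request.json()
+    # payload contains task_id, task_type, status, timestamp, urls (and data if enabled)
+    return {"ok": True}
+```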
+
+#### Global Default Webhook
+
+Configure a default webhook URL in `config.yml` for all jobs:
+
+```yaml
+webhooks:
+ enabled: true
+ default_url: "https://myapp.com/webhooks/default"
+ data_in_payload: false
+ retry:
+ max_attempts: 5
+ initial_delay_ms: 1000
+ max_delay_ms: 32000
+ timeout_ms: 30000
+```
+
+Now jobs without `webhook_config` automatically use the default webhook.
+
+#### Job Status Polling (Without Webhooks)
+
+If you prefer polling instead of webhooks, just omit `webhook_config`:
+
+```bash
+# Submit job
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{"urls": ["https://example.com"]}'
+# Response: {"task_id": "crawl_xyz"}
+
+# Poll for status
+curl http://localhost:11235/crawl/job/crawl_xyz
+```
+
+The response includes a `status` field: `"processing"`, `"completed"`, or `"failed"`.
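+
+The equivalent polling loop in Python (a sketch using `httpx`; endpoint paths as documented above):
+
+```python
+import time
+import httpx
+
+def wait_for_job(task_id: str, base_url: str = "http://localhost:11235") -> dict:
+    """Poll /crawl/job/{task_id} until the job leaves the 'processing' state."""
+    while True:
+        job = httpx.get(f"{base_url}/crawl/job/{task_id}").json()
+        if job["status"] in ("completed", "failed"):
+            return job
+        time.sleep(2)  # poll every 2 seconds
+```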
+
+#### LLM Extraction Jobs with Webhooks
+
+The same webhook system works for LLM extraction jobs via `/llm/job`:
+
+```bash
+# Submit LLM extraction job with webhook
+curl -X POST http://localhost:11235/llm/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "url": "https://example.com/article",
+ "q": "Extract the article title, author, and main points",
+ "provider": "openai/gpt-4o-mini",
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/llm-complete",
+ "webhook_data_in_payload": true,
+ "webhook_headers": {
+ "X-Webhook-Secret": "your-secret-token"
+ }
+ }
+ }'
+
+# Response: {"task_id": "llm_1234567890"}
+```
+
+**Your webhook receives:**
+```json
+{
+ "task_id": "llm_1234567890",
+ "task_type": "llm_extraction",
+ "status": "completed",
+ "timestamp": "2025-10-22T12:30:00.000000+00:00",
+ "urls": ["https://example.com/article"],
+ "data": {
+ "extracted_content": {
+ "title": "Understanding Web Scraping",
+ "author": "John Doe",
+ "main_points": ["Point 1", "Point 2", "Point 3"]
+ }
+ }
+}
+```
+
+**Key Differences for LLM Jobs:**
+- Task type is `"llm_extraction"` instead of `"crawl"`
+- Extracted data is in `data.extracted_content`
+- Single URL only (not an array)
+- Supports schema-based extraction with `schema` parameter
+
+> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
+
---
## Metrics & Monitoring
@@ -692,8 +879,7 @@ app:
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
- api_key_env: "OPENAI_API_KEY"
- # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
+ # api_key: sk-... # If you pass the API key directly (not recommended)
# Redis Configuration (Used by internal Redis server managed by supervisord)
redis:
@@ -827,10 +1013,11 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
- Building and running the Docker container
-- Configuring the environment
+- Configuring the environment
- Using the interactive playground for testing
- Making API requests with proper typing
- Using the Python SDK
+- Asynchronous job queues with webhook notifications
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
- Connecting via the Model Context Protocol (MCP)
- Monitoring your deployment
diff --git a/deploy/docker/STRESS_TEST_PIPELINE.md b/deploy/docker/STRESS_TEST_PIPELINE.md
new file mode 100644
index 00000000..44025514
--- /dev/null
+++ b/deploy/docker/STRESS_TEST_PIPELINE.md
@@ -0,0 +1,241 @@
+# Crawl4AI Docker Memory & Pool Optimization - Implementation Log
+
+## Critical Issues Identified
+
+### Memory Management
+- **Host vs Container**: `psutil.virtual_memory()` reported host memory, not container limits
+- **Browser Pooling**: No pool reuse - every endpoint created new browsers
+- **Warmup Waste**: Permanent browser sat idle with mismatched config signature
+- **Idle Cleanup**: 30min TTL too long, janitor ran every 60s
+- **Endpoint Inconsistency**: 75% of endpoints bypassed pool (`/md`, `/html`, `/screenshot`, `/pdf`, `/execute_js`, `/llm`)
+
+### Pool Design Flaws
+- **Config Mismatch**: Permanent browser used `config.yml` args, endpoints used empty `BrowserConfig()`
+- **Logging Level**: Pool hit markers at DEBUG, invisible with INFO logging
+
+## Implementation Changes
+
+### 1. Container-Aware Memory Detection (`utils.py`)
+```python
+def get_container_memory_percent() -> float:
+ # Try cgroup v2 → v1 → fallback to psutil
+ # Reads /sys/fs/cgroup/memory.{current,max} OR memory/memory.{usage,limit}_in_bytes
+```
+
+### 2. Smart Browser Pool (`crawler_pool.py`)
+**3-Tier System:**
+- **PERMANENT**: Always-ready default browser (never cleaned)
+- **HOT_POOL**: Configs used 3+ times (longer TTL)
+- **COLD_POOL**: New/rare configs (short TTL)
+
+**Key Functions:**
+- `get_crawler(cfg)`: Check permanent → hot → cold → create new
+- `init_permanent(cfg)`: Initialize permanent at startup
+- `janitor()`: Adaptive cleanup (10s/30s/60s intervals based on memory)
+- `_sig(cfg)`: SHA1 hash of config dict for pool keys
+
+**Logging Fix**: Changed `logger.debug()` → `logger.info()` for pool hits
+
+### 3. Endpoint Unification
+**Helper Function** (`server.py`):
+```python
+def get_default_browser_config() -> BrowserConfig:
+ return BrowserConfig(
+ extra_args=config["crawler"]["browser"].get("extra_args", []),
+ **config["crawler"]["browser"].get("kwargs", {}),
+ )
+```
+
+**Migrated Endpoints:**
+- `/html`, `/screenshot`, `/pdf`, `/execute_js` → use `get_default_browser_config()`
+- `handle_llm_qa()`, `handle_markdown_request()` → same
+
+**Result**: All endpoints now hit permanent browser pool
+
+### 4. Config Updates (`config.yml`)
+- `idle_ttl_sec: 1800` → `300` (30min → 5min base TTL)
+- `port: 11234` → `11235` (fixed mismatch with Gunicorn)
+
+### 5. Lifespan Fix (`server.py`)
+```python
+await init_permanent(BrowserConfig(
+ extra_args=config["crawler"]["browser"].get("extra_args", []),
+ **config["crawler"]["browser"].get("kwargs", {}),
+))
+```
+Permanent browser now matches endpoint config signatures
+
+## Test Results
+
+### Test 1: Basic Health
+- 10 requests to `/health`
+- **Result**: 100% success, avg 3ms latency
+- **Baseline**: Container starts in ~5s, 270 MB idle
+
+### Test 2: Memory Monitoring
+- 20 requests with Docker stats tracking
+- **Result**: 100% success, no memory leak (-0.2 MB delta)
+- **Baseline**: 269.7 MB container overhead
+
+### Test 3: Pool Validation
+- 30 requests to `/html` endpoint
+- **Result**: **100% permanent browser hits**, 0 new browsers created
+- **Memory**: 287 MB baseline → 396 MB active (+109 MB)
+- **Latency**: Avg 4s (includes network to httpbin.org)
+
+### Test 4: Concurrent Load
+- Light (10) → Medium (50) → Heavy (100) concurrent
+- **Total**: 320 requests
+- **Result**: 100% success, **320/320 permanent hits**, 0 new browsers
+- **Memory**: 269 MB → peak 1533 MB → final 993 MB
+- **Latency**: P99 at 100 concurrent = 34s (expected with single browser)
+
+### Test 5: Pool Stress (Mixed Configs)
+- 20 requests with 4 different viewport configs
+- **Result**: 4 new browsers, 4 cold hits, **4 promotions to hot**, 8 hot hits
+- **Reuse Rate**: 60% (12 pool hits / 20 requests)
+- **Memory**: 270 MB → 928 MB peak (+658 MB = ~165 MB per browser)
+- **Proves**: Cold → hot promotion at 3 uses working perfectly
+
+### Test 6: Multi-Endpoint
+- 10 requests each: `/html`, `/screenshot`, `/pdf`, `/crawl`
+- **Result**: 100% success across all 4 endpoints
+- **Latency**: 5-8s avg (PDF slowest at 7.2s)
+
+### Test 7: Cleanup Verification
+- 20 requests (load spike) → 90s idle
+- **Memory**: 269 MB → peak 1107 MB → final 780 MB
+- **Recovery**: 327 MB (39%) - partial cleanup
+- **Note**: Hot pool browsers persist (by design), janitor working correctly
+
+## Performance Metrics
+
+| Metric | Before | After | Improvement |
+|--------|--------|-------|-------------|
+| Pool Reuse | 0% | 100% (default config) | ∞ |
+| Memory Leak | Unknown | 0 MB/cycle | Stable |
+| Browser Reuse | No | Yes | ~3-5s saved per request |
+| Idle Memory | 500-700 MB × N | 270-400 MB | 10x reduction |
+| Concurrent Capacity | ~20 | 100+ | 5x |
+
+## Key Learnings
+
+1. **Config Signature Matching**: Permanent browser MUST match endpoint default config exactly (SHA1 hash)
+2. **Logging Levels**: Pool diagnostics need INFO level, not DEBUG
+3. **Memory in Docker**: Must read cgroup files, not host metrics
+4. **Janitor Timing**: 60s interval adequate, but TTLs should be short (5min) for cold pool
+5. **Hot Promotion**: 3-use threshold works well for production patterns
+6. **Memory Per Browser**: ~150-200 MB per Chromium instance with headless + text_mode
+
+## Test Infrastructure
+
+**Location**: `deploy/docker/tests/`
+**Dependencies**: `httpx`, `docker` (Python SDK)
+**Pattern**: Sequential build - each test adds one capability
+
+**Files**:
+- `test_1_basic.py`: Health check + container lifecycle
+- `test_2_memory.py`: + Docker stats monitoring
+- `test_3_pool.py`: + Log analysis for pool markers
+- `test_4_concurrent.py`: + asyncio.Semaphore for concurrency control
+- `test_5_pool_stress.py`: + Config variants (viewports)
+- `test_6_multi_endpoint.py`: + Multiple endpoint testing
+- `test_7_cleanup.py`: + Time-series memory tracking for janitor
+
+**Run Pattern**:
+```bash
+cd deploy/docker/tests
+pip install -r requirements.txt
+# Rebuild after code changes:
+cd /path/to/repo && docker buildx build -t crawl4ai-local:latest --load .
+# Run test:
+python test_N_name.py
+```
+
+## Architecture Decisions
+
+**Why Permanent Browser?**
+- 90% of requests use default config → single browser serves most traffic
+- Eliminates 3-5s startup overhead per request
+
+**Why 3-Tier Pool?**
+- Permanent: Zero cost for common case
+- Hot: Amortized cost for frequent variants
+- Cold: Lazy allocation for rare configs
+
+**Why Adaptive Janitor?**
+- Memory pressure triggers aggressive cleanup
+- Low memory allows longer TTLs for better reuse
+
+**Why Not Close After Each Request?**
+- Browser startup: 3-5s overhead
+- Pool reuse: <100ms overhead
+- Net: 30-50x faster
+
+## Future Optimizations
+
+1. **Request Queuing**: When at capacity, queue instead of reject
+2. **Pre-warming**: Predict common configs, pre-create browsers
+3. **Metrics Export**: Prometheus metrics for pool efficiency
+4. **Config Normalization**: Group similar viewports (e.g., 1920±50 → 1920)
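+
+For item 4, config normalization could be as simple as snapping viewport widths to common presets so near-identical configs share a pool signature (an illustrative sketch, not implemented):
+
+```python
+COMMON_WIDTHS = [1280, 1366, 1440, 1920, 2560]
+
+def normalize_viewport_width(width: int, tolerance: int = 50) -> int:
+    """Snap a width to the nearest common preset within tolerance."""
+    for preset in COMMON_WIDTHS:
+        if abs(width - preset) <= tolerance:
+            return preset
+    return width
+
+assert normalize_viewport_width(1955) == 1920  # 1920±50 → 1920, as suggested above
+```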
+
+## Critical Code Paths
+
+**Browser Acquisition** (`crawler_pool.py:34-78`):
+```
+get_crawler(cfg) →
+ _sig(cfg) →
+ if sig == DEFAULT_CONFIG_SIG → PERMANENT
+ elif sig in HOT_POOL → HOT_POOL[sig]
+ elif sig in COLD_POOL → promote if count >= 3
+ else → create new in COLD_POOL
+```
+
+**Janitor Loop** (`crawler_pool.py:107-146`):
+```
+while True:
+ mem% = get_container_memory_percent()
+ if mem% > 80: interval=10s, cold_ttl=30s
+ elif mem% > 60: interval=30s, cold_ttl=60s
+ else: interval=60s, cold_ttl=300s
+ sleep(interval)
+ close idle browsers (COLD then HOT)
+```
+
+**Endpoint Pattern** (`server.py` example):
+```python
+@app.post("/html")
+async def generate_html(...):
+ from crawler_pool import get_crawler
+ crawler = await get_crawler(get_default_browser_config())
+ results = await crawler.arun(url=body.url, config=cfg)
+ # No crawler.close() - returned to pool
+```
+
+## Debugging Tips
+
+**Check Pool Activity**:
+```bash
+docker logs crawl4ai-test | grep -E "(🔥|♨️|❄️|🆕|⬆️)"
+```
+
+**Verify Config Signature**:
+```python
+from crawl4ai import BrowserConfig
+import json, hashlib
+cfg = BrowserConfig(...)
+sig = hashlib.sha1(json.dumps(cfg.to_dict(), sort_keys=True).encode()).hexdigest()
+print(sig[:8]) # Compare with logs
+```
+
+**Monitor Memory**:
+```bash
+docker stats crawl4ai-test
+```
+
+## Known Limitations
+
+- **Mac Docker Stats**: CPU metrics unreliable, memory works
+- **PDF Generation**: Slowest endpoint (~7s), no optimization yet
+- **Hot Pool Persistence**: May hold memory longer than needed (trade-off for performance)
+- **Janitor Lag**: Up to 60s before cleanup triggers in low-memory scenarios
diff --git a/deploy/docker/WEBHOOK_EXAMPLES.md b/deploy/docker/WEBHOOK_EXAMPLES.md
new file mode 100644
index 00000000..190efb18
--- /dev/null
+++ b/deploy/docker/WEBHOOK_EXAMPLES.md
@@ -0,0 +1,378 @@
+# Webhook Feature Examples
+
+This document provides examples of how to use the webhook feature for crawl and LLM extraction jobs in Crawl4AI.
+
+## Overview
+
+The webhook feature allows you to receive notifications when crawl and LLM extraction jobs complete, eliminating the need for polling. Webhooks are retried with exponential backoff to improve delivery reliability.
+
+## Configuration
+
+### Global Configuration (config.yml)
+
+You can configure default webhook settings in `config.yml`:
+
+```yaml
+webhooks:
+ enabled: true
+ default_url: null # Optional: default webhook URL for all jobs
+ data_in_payload: false # Optional: default behavior for including data
+ retry:
+ max_attempts: 5
+ initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
+ max_delay_ms: 32000
+ timeout_ms: 30000 # 30s timeout per webhook call
+ headers: # Optional: default headers to include
+ User-Agent: "Crawl4AI-Webhook/1.0"
+```
+
+## API Usage Examples
+
+### Example 1: Basic Webhook (Notification Only)
+
+Send a webhook notification without including the crawl data in the payload.
+
+**Request:**
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+ "webhook_data_in_payload": false
+ }
+ }'
+```
+
+**Response:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4"
+}
+```
+
+**Webhook Payload Received:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4",
+ "task_type": "crawl",
+ "status": "completed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com"]
+}
+```
+
+Your webhook handler should then fetch the results:
+```bash
+curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
+```
+
+### Example 2: Webhook with Data Included
+
+Include the full crawl results in the webhook payload.
+
+**Request:**
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+ "webhook_data_in_payload": true
+ }
+ }'
+```
+
+**Webhook Payload Received:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4",
+ "task_type": "crawl",
+ "status": "completed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com"],
+ "data": {
+ "markdown": "...",
+ "html": "...",
+ "links": {...},
+ "metadata": {...}
+ }
+}
+```
+
+### Example 3: Webhook with Custom Headers
+
+Include custom headers for authentication or identification.
+
+**Request:**
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"],
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+ "webhook_data_in_payload": false,
+ "webhook_headers": {
+ "X-Webhook-Secret": "my-secret-token",
+ "X-Service-ID": "crawl4ai-production"
+ }
+ }
+ }'
+```
+
+The webhook request will include these custom headers in addition to the default headers from config.yml.
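+
+On the receiving side, a minimal sketch of checking that secret before trusting the payload (assumes Flask; the route and header name mirror this example, and `EXPECTED_SECRET` is an illustrative constant):
+
+```python
+# Reject webhook calls whose X-Webhook-Secret header does not match.
+import hmac
+from flask import Flask, request, abort
+
+app = Flask(__name__)
+EXPECTED_SECRET = "my-secret-token"  # load from an env var in practice
+
+@app.route("/webhooks/crawl-complete", methods=["POST"])
+def crawl_complete():
+    provided = request.headers.get("X-Webhook-Secret", "")
+    if not hmac.compare_digest(provided, EXPECTED_SECRET):
+        abort(401)  # wrong or missing secret
+    payload = request.json
+    return {"status": "received"}, 200
+```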
+
+### Example 4: Failure Notification
+
+When a crawl job fails, a webhook is sent with error details.
+
+**Webhook Payload on Failure:**
+```json
+{
+ "task_id": "crawl_a1b2c3d4",
+ "task_type": "crawl",
+ "status": "failed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com"],
+ "error": "Connection timeout after 30s"
+}
+```
+
+### Example 5: Using Global Default Webhook
+
+If you set a `default_url` in config.yml, jobs without webhook_config will use it:
+
+**config.yml:**
+```yaml
+webhooks:
+ enabled: true
+ default_url: "https://myapp.com/webhooks/default"
+ data_in_payload: false
+```
+
+**Request (no webhook_config needed):**
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "urls": ["https://example.com"]
+ }'
+```
+
+The webhook will be sent to the default URL configured in config.yml.
+
+### Example 6: LLM Extraction Job with Webhook
+
+Use webhooks with the LLM extraction endpoint for asynchronous processing.
+
+**Request:**
+```bash
+curl -X POST http://localhost:11235/llm/job \
+ -H "Content-Type: application/json" \
+ -d '{
+ "url": "https://example.com/article",
+ "q": "Extract the article title, author, and publication date",
+ "schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
+ "cache": false,
+ "provider": "openai/gpt-4o-mini",
+ "webhook_config": {
+ "webhook_url": "https://myapp.com/webhooks/llm-complete",
+ "webhook_data_in_payload": true
+ }
+ }'
+```
+
+**Response:**
+```json
+{
+ "task_id": "llm_1698765432_12345"
+}
+```
+
+**Webhook Payload Received:**
+```json
+{
+ "task_id": "llm_1698765432_12345",
+ "task_type": "llm_extraction",
+ "status": "completed",
+ "timestamp": "2025-10-21T10:30:00.000000+00:00",
+ "urls": ["https://example.com/article"],
+ "data": {
+ "extracted_content": {
+ "title": "Understanding Web Scraping",
+ "author": "John Doe",
+ "date": "2025-10-21"
+ }
+ }
+}
+```
+
+## Webhook Handler Example
+
+Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs:
+
+```python
+from flask import Flask, request, jsonify
+import requests
+
+app = Flask(__name__)
+
+@app.route('/webhooks/crawl-complete', methods=['POST'])
+def handle_crawl_webhook():
+ payload = request.json
+
+ task_id = payload['task_id']
+ task_type = payload['task_type']
+ status = payload['status']
+
+ if status == 'completed':
+ # If data not in payload, fetch it
+ if 'data' not in payload:
+ # Determine endpoint based on task type
+ endpoint = 'crawl' if task_type == 'crawl' else 'llm'
+ response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
+ data = response.json()
+ else:
+ data = payload['data']
+
+ # Process based on task type
+ if task_type == 'crawl':
+ print(f"Processing crawl results for {task_id}")
+ # Handle crawl results
+ results = data.get('results', [])
+ for result in results:
+ print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
+
+ elif task_type == 'llm_extraction':
+ print(f"Processing LLM extraction for {task_id}")
+ # Handle LLM extraction
+ # Note: Webhook sends 'extracted_content', API returns 'result'
+ extracted = data.get('extracted_content', data.get('result', {}))
+ print(f" - Extracted: {extracted}")
+
+ # Your business logic here...
+
+ elif status == 'failed':
+ error = payload.get('error', 'Unknown error')
+ print(f"{task_type} job {task_id} failed: {error}")
+ # Handle failure...
+
+ return jsonify({"status": "received"}), 200
+
+if __name__ == '__main__':
+ app.run(port=8080)
+```
+
+## Retry Logic
+
+The webhook delivery service uses exponential backoff retry logic:
+
+- **Attempts:** Up to 5 attempts by default
+- **Delays:** 1s → 2s → 4s → 8s → 16s (see the sketch after this list)
+- **Timeout:** 30 seconds per attempt
+- **Retry Conditions:**
+ - Server errors (5xx status codes)
+ - Network errors
+ - Timeouts
+- **No Retry:**
+ - Client errors (4xx status codes)
+ - Successful delivery (2xx status codes)
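+
+The schedule above is standard capped exponential backoff; here is a sketch of the computation (the function itself is illustrative, with defaults taken from the config values above):
+
+```python
+def backoff_delay_ms(attempt: int, initial_ms: int = 1000, max_ms: int = 32000) -> int:
+    """Delay before retry `attempt` (0-based): initial * 2**attempt, capped at max."""
+    return min(initial_ms * (2 ** attempt), max_ms)
+
+print([backoff_delay_ms(a) for a in range(5)])  # [1000, 2000, 4000, 8000, 16000]
+```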
+
+## Benefits
+
+1. **No Polling Required** - Eliminates constant API calls to check job status
+2. **Real-time Notifications** - Immediate notification when jobs complete
+3. **Reliable Delivery** - Exponential backoff retries maximize the chance of delivery
+4. **Flexible** - Choose between notification-only or full data delivery
+5. **Secure** - Support for custom headers for authentication
+6. **Configurable** - Global defaults or per-job configuration
+7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
+
+## TypeScript Client Example
+
+```typescript
+interface WebhookConfig {
+ webhook_url: string;
+ webhook_data_in_payload?: boolean;
+  webhook_headers?: Record<string, string>;
+}
+
+interface CrawlJobRequest {
+ urls: string[];
+  browser_config?: Record<string, any>;
+  crawler_config?: Record<string, any>;
+ webhook_config?: WebhookConfig;
+}
+
+interface LLMJobRequest {
+ url: string;
+ q: string;
+ schema?: string;
+ cache?: boolean;
+ provider?: string;
+ webhook_config?: WebhookConfig;
+}
+
+async function createCrawlJob(request: CrawlJobRequest) {
+ const response = await fetch('http://localhost:11235/crawl/job', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(request)
+ });
+
+ const { task_id } = await response.json();
+ return task_id;
+}
+
+async function createLLMJob(request: LLMJobRequest) {
+ const response = await fetch('http://localhost:11235/llm/job', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(request)
+ });
+
+ const { task_id } = await response.json();
+ return task_id;
+}
+
+// Usage - Crawl Job
+const crawlTaskId = await createCrawlJob({
+ urls: ['https://example.com'],
+ webhook_config: {
+ webhook_url: 'https://myapp.com/webhooks/crawl-complete',
+ webhook_data_in_payload: false,
+ webhook_headers: {
+ 'X-Webhook-Secret': 'my-secret'
+ }
+ }
+});
+
+// Usage - LLM Extraction Job
+const llmTaskId = await createLLMJob({
+ url: 'https://example.com/article',
+ q: 'Extract the main points from this article',
+ provider: 'openai/gpt-4o-mini',
+ webhook_config: {
+ webhook_url: 'https://myapp.com/webhooks/llm-complete',
+ webhook_data_in_payload: true,
+ webhook_headers: {
+ 'X-Webhook-Secret': 'my-secret'
+ }
+ }
+});
+```
+
+## Monitoring and Debugging
+
+Webhook delivery attempts are logged at INFO level:
+- Successful deliveries
+- Retry attempts with delays
+- Final failures after max attempts
+
+Check the application logs for webhook delivery status:
+```bash
+docker logs crawl4ai-container | grep -i webhook
+```
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 58d8c01f..4fab27b1 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -4,7 +4,7 @@ import asyncio
from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
-from datetime import datetime
+from datetime import datetime, timezone
from base64 import b64encode
import logging
@@ -42,8 +42,11 @@ from utils import (
should_cleanup_task,
decode_redis_hash,
get_llm_api_key,
- validate_llm_provider
+ validate_llm_provider,
+ get_llm_temperature,
+ get_llm_base_url
)
+from webhook import WebhookDeliveryService
import psutil, time
@@ -64,6 +67,7 @@ async def handle_llm_qa(
config: dict
) -> str:
"""Process QA using LLM with crawled content as context."""
+ from crawler_pool import get_crawler
try:
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
url = 'https://' + url
@@ -72,15 +76,21 @@ async def handle_llm_qa(
if last_q_index != -1:
url = url[:last_q_index]
- # Get markdown content
- async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(url)
- if not result.success:
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=result.error_message
- )
- content = result.markdown.fit_markdown or result.markdown.raw_markdown
+ # Get markdown content (use default config)
+ from utils import load_config
+ cfg = load_config()
+ browser_cfg = BrowserConfig(
+ extra_args=cfg["crawler"]["browser"].get("extra_args", []),
+ **cfg["crawler"]["browser"].get("kwargs", {}),
+ )
+ crawler = await get_crawler(browser_cfg)
+ result = await crawler.arun(url)
+ if not result.success:
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=result.error_message
+ )
+ content = result.markdown.fit_markdown or result.markdown.raw_markdown
# Create prompt and get LLM response
prompt = f"""Use the following content as context to answer the question.
@@ -96,7 +106,9 @@ async def handle_llm_qa(
response = perform_completion_with_backoff(
provider=config["llm"]["provider"],
prompt_with_variables=prompt,
- api_token=get_llm_api_key(config)
+ api_token=get_llm_api_key(config), # Returns None to let litellm handle it
+ temperature=get_llm_temperature(config),
+ base_url=get_llm_base_url(config)
)
return response.choices[0].message.content
@@ -115,9 +127,15 @@ async def process_llm_extraction(
instruction: str,
schema: Optional[str] = None,
cache: str = "0",
- provider: Optional[str] = None
+ provider: Optional[str] = None,
+ webhook_config: Optional[Dict] = None,
+ temperature: Optional[float] = None,
+ base_url: Optional[str] = None
) -> None:
"""Process LLM extraction in background."""
+ # Initialize webhook service
+ webhook_service = WebhookDeliveryService(config)
+
try:
# Validate provider
is_valid, error_msg = validate_llm_provider(config, provider)
@@ -126,12 +144,24 @@ async def process_llm_extraction(
"status": TaskStatus.FAILED,
"error": error_msg
})
+
+ # Send webhook notification on failure
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="llm_extraction",
+ status="failed",
+ urls=[url],
+ webhook_config=webhook_config,
+ error=error_msg
+ )
return
- api_key = get_llm_api_key(config, provider)
+ api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
llm_strategy = LLMExtractionStrategy(
llm_config=LLMConfig(
provider=provider or config["llm"]["provider"],
- api_token=api_key
+ api_token=api_key,
+ temperature=temperature or get_llm_temperature(config, provider),
+ base_url=base_url or get_llm_base_url(config, provider)
),
instruction=instruction,
schema=json.loads(schema) if schema else None,
@@ -154,17 +184,40 @@ async def process_llm_extraction(
"status": TaskStatus.FAILED,
"error": result.error_message
})
+
+ # Send webhook notification on failure
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="llm_extraction",
+ status="failed",
+ urls=[url],
+ webhook_config=webhook_config,
+ error=result.error_message
+ )
return
try:
content = json.loads(result.extracted_content)
except json.JSONDecodeError:
content = result.extracted_content
+
+ result_data = {"extracted_content": content}
+
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.COMPLETED,
"result": json.dumps(content)
})
+ # Send webhook notification on successful completion
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="llm_extraction",
+ status="completed",
+ urls=[url],
+ webhook_config=webhook_config,
+ result=result_data
+ )
+
except Exception as e:
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
await redis.hset(f"task:{task_id}", mapping={
@@ -172,13 +225,25 @@ async def process_llm_extraction(
"error": str(e)
})
+ # Send webhook notification on failure
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="llm_extraction",
+ status="failed",
+ urls=[url],
+ webhook_config=webhook_config,
+ error=str(e)
+ )
+
async def handle_markdown_request(
url: str,
filter_type: FilterType,
query: Optional[str] = None,
cache: str = "0",
config: Optional[dict] = None,
- provider: Optional[str] = None
+ provider: Optional[str] = None,
+ temperature: Optional[float] = None,
+ base_url: Optional[str] = None
) -> str:
"""Handle markdown generation requests."""
try:
@@ -203,7 +268,9 @@ async def handle_markdown_request(
FilterType.LLM: LLMContentFilter(
llm_config=LLMConfig(
provider=provider or config["llm"]["provider"],
- api_token=get_llm_api_key(config, provider),
+ api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
+ temperature=temperature or get_llm_temperature(config, provider),
+ base_url=base_url or get_llm_base_url(config, provider)
),
instruction=query or "Extract main content"
)
@@ -212,25 +279,32 @@ async def handle_markdown_request(
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
- async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url=decoded_url,
- config=CrawlerRunConfig(
- markdown_generator=md_generator,
- scraping_strategy=LXMLWebScrapingStrategy(),
- cache_mode=cache_mode
- )
+ from crawler_pool import get_crawler
+ from utils import load_config as _load_config
+ _cfg = _load_config()
+ browser_cfg = BrowserConfig(
+ extra_args=_cfg["crawler"]["browser"].get("extra_args", []),
+ **_cfg["crawler"]["browser"].get("kwargs", {}),
+ )
+ crawler = await get_crawler(browser_cfg)
+ result = await crawler.arun(
+ url=decoded_url,
+ config=CrawlerRunConfig(
+ markdown_generator=md_generator,
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ cache_mode=cache_mode
)
-
- if not result.success:
- raise HTTPException(
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
- detail=result.error_message
- )
+ )
- return (result.markdown.raw_markdown
- if filter_type == FilterType.RAW
- else result.markdown.fit_markdown)
+ if not result.success:
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=result.error_message
+ )
+
+ return (result.markdown.raw_markdown
+ if filter_type == FilterType.RAW
+ else result.markdown.fit_markdown)
except Exception as e:
logger.error(f"Markdown error: {str(e)}", exc_info=True)
@@ -248,7 +322,10 @@ async def handle_llm_request(
schema: Optional[str] = None,
cache: str = "0",
config: Optional[dict] = None,
- provider: Optional[str] = None
+ provider: Optional[str] = None,
+ webhook_config: Optional[Dict] = None,
+ temperature: Optional[float] = None,
+ api_base_url: Optional[str] = None
) -> JSONResponse:
"""Handle LLM extraction requests."""
base_url = get_base_url(request)
@@ -279,7 +356,10 @@ async def handle_llm_request(
cache,
base_url,
config,
- provider
+ provider,
+ webhook_config,
+ temperature,
+ api_base_url
)
except Exception as e:
@@ -324,7 +404,10 @@ async def create_new_task(
cache: str,
base_url: str,
config: dict,
- provider: Optional[str] = None
+ provider: Optional[str] = None,
+ webhook_config: Optional[Dict] = None,
+ temperature: Optional[float] = None,
+ api_base_url: Optional[str] = None
) -> JSONResponse:
"""Create and initialize a new task."""
decoded_url = unquote(input_path)
@@ -333,12 +416,18 @@ async def create_new_task(
from datetime import datetime
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
-
- await redis.hset(f"task:{task_id}", mapping={
+
+ task_data = {
"status": TaskStatus.PROCESSING,
"created_at": datetime.now().isoformat(),
"url": decoded_url
- })
+ }
+
+ # Store webhook config if provided
+ if webhook_config:
+ task_data["webhook_config"] = json.dumps(webhook_config)
+
+ await redis.hset(f"task:{task_id}", mapping=task_data)
background_tasks.add_task(
process_llm_extraction,
@@ -349,7 +438,10 @@ async def create_new_task(
query,
schema,
cache,
- provider
+ provider,
+ webhook_config,
+ temperature,
+ api_base_url
)
return JSONResponse({
@@ -393,6 +485,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb()
result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb
+ # Ensure fit_html is JSON-serializable
+ if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+ result_dict["fit_html"] = None
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
@@ -419,14 +514,26 @@ async def handle_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
- config: dict
+ config: dict,
+ hooks_config: Optional[dict] = None
) -> dict:
- """Handle non-streaming crawl requests."""
+ """Handle non-streaming crawl requests with optional hooks."""
+ # Track request start
+ request_id = f"req_{uuid4().hex[:8]}"
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_request_start(
+ request_id, "/crawl", urls[0] if urls else "batch", browser_config
+ )
+ except:
+ pass # Monitor not critical
+
start_mem_mb = _get_memory_mb() # <--- Get memory before
start_time = time.time()
mem_delta_mb = None
peak_mem_mb = start_mem_mb
-
+ hook_manager = None
+
try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config)
@@ -445,11 +552,27 @@ async def handle_crawl_request(
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
+ # Attach hooks if provided
+ hooks_status = {}
+ if hooks_config:
+ from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+ hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+ hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+ crawler,
+ hooks_config.get('code', {}),
+ timeout=hooks_config.get('timeout', 30),
+ hook_manager=hook_manager
+ )
+ logger.info(f"Hooks attachment status: {hooks_status['status']}")
+
base_config = config["crawler"]["base_config"]
- # Iterate on key-value pairs in global_config then use haseattr to set them
+ # Iterate on key-value pairs in global_config then use hasattr to set them
for key, value in base_config.items():
if hasattr(crawler_config, key):
- setattr(crawler_config, key, value)
+ current_value = getattr(crawler_config, key)
+ # Only set base config if user didn't provide a value
+ if current_value is None or current_value == "":
+ setattr(crawler_config, key, value)
results = []
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
@@ -458,6 +581,10 @@ async def handle_crawl_request(
config=crawler_config,
dispatcher=dispatcher)
results = await partial_func()
+
+ # Ensure results is always a list
+ if not isinstance(results, list):
+ results = [results]
# await crawler.close()
@@ -472,13 +599,39 @@ async def handle_crawl_request(
# Process results to handle PDF bytes
processed_results = []
for result in results:
- result_dict = result.model_dump()
- # If PDF exists, encode it to base64
- if result_dict.get('pdf') is not None:
- result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
- processed_results.append(result_dict)
+ try:
+ # Check if result has model_dump method (is a proper CrawlResult)
+ if hasattr(result, 'model_dump'):
+ result_dict = result.model_dump()
+ elif isinstance(result, dict):
+ result_dict = result
+ else:
+ # Handle unexpected result type
+ logger.warning(f"Unexpected result type: {type(result)}")
+ result_dict = {
+ "url": str(result) if hasattr(result, '__str__') else "unknown",
+ "success": False,
+ "error_message": f"Unexpected result type: {type(result).__name__}"
+ }
+
+ # if fit_html is not a string, set it to None to avoid serialization errors
+ if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
+ result_dict["fit_html"] = None
+
+ # If PDF exists, encode it to base64
+ if result_dict.get('pdf') is not None and isinstance(result_dict.get('pdf'), bytes):
+ result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+
+ processed_results.append(result_dict)
+ except Exception as e:
+ logger.error(f"Error processing result: {e}")
+ processed_results.append({
+ "url": "unknown",
+ "success": False,
+ "error_message": str(e)
+ })
- return {
+ response = {
"success": True,
"results": processed_results,
"server_processing_time_s": end_time - start_time,
@@ -486,8 +639,53 @@ async def handle_crawl_request(
"server_peak_memory_mb": peak_mem_mb
}
+ # Track request completion
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_request_end(
+ request_id, success=True, pool_hit=True, status_code=200
+ )
+ except:
+ pass
+
+ # Add hooks information if hooks were used
+ if hooks_config and hook_manager:
+ from hook_manager import UserHookManager
+ if isinstance(hook_manager, UserHookManager):
+ try:
+ # Ensure all hook data is JSON serializable
+ hook_data = {
+ "status": hooks_status,
+ "execution_log": hook_manager.execution_log,
+ "errors": hook_manager.errors,
+ "summary": hook_manager.get_summary()
+ }
+ # Test that it's serializable
+ json.dumps(hook_data)
+ response["hooks"] = hook_data
+ except (TypeError, ValueError) as e:
+ logger.error(f"Hook data not JSON serializable: {e}")
+ response["hooks"] = {
+ "status": {"status": "error", "message": "Hook data serialization failed"},
+ "execution_log": [],
+ "errors": [{"error": str(e)}],
+ "summary": {}
+ }
+
+ return response
+
except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True)
+
+ # Track request error
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_request_end(
+ request_id, success=False, error=str(e), status_code=500
+ )
+ except:
+ pass
+
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
# try:
# await crawler.close()
@@ -513,9 +711,11 @@ async def handle_stream_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
- config: dict
-) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
- """Handle streaming crawl requests."""
+ config: dict,
+ hooks_config: Optional[dict] = None
+) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
+ """Handle streaming crawl requests with optional hooks."""
+ hooks_info = None
try:
browser_config = BrowserConfig.load(browser_config)
# browser_config.verbose = True # Set to False or remove for production stress testing
@@ -536,6 +736,20 @@ async def handle_stream_crawl_request(
# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
+
+ # Attach hooks if provided
+ if hooks_config:
+ from hook_manager import attach_user_hooks_to_crawler, UserHookManager
+ hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
+ hooks_status, hook_manager = await attach_user_hooks_to_crawler(
+ crawler,
+ hooks_config.get('code', {}),
+ timeout=hooks_config.get('timeout', 30),
+ hook_manager=hook_manager
+ )
+ logger.info(f"Hooks attachment status for streaming: {hooks_status['status']}")
+ # Include hook manager in hooks_info for proper tracking
+ hooks_info = {'status': hooks_status, 'manager': hook_manager}
results_gen = await crawler.arun_many(
urls=urls,
@@ -543,7 +757,7 @@ async def handle_stream_crawl_request(
dispatcher=dispatcher
)
- return crawler, results_gen
+ return crawler, results_gen, hooks_info
except Exception as e:
# Make sure to close crawler if started during an error here
@@ -567,6 +781,7 @@ async def handle_crawl_job(
browser_config: Dict,
crawler_config: Dict,
config: Dict,
+ webhook_config: Optional[Dict] = None,
) -> Dict:
"""
Fire-and-forget version of handle_crawl_request.
@@ -574,13 +789,24 @@ async def handle_crawl_job(
lets /crawl/job/{task_id} polling fetch the result.
"""
task_id = f"crawl_{uuid4().hex[:8]}"
- await redis.hset(f"task:{task_id}", mapping={
+
+ # Store task data in Redis
+ task_data = {
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
- "created_at": datetime.utcnow().isoformat(),
+ "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
"url": json.dumps(urls), # store list as JSON string
"result": "",
"error": "",
- })
+ }
+
+ # Store webhook config if provided
+ if webhook_config:
+ task_data["webhook_config"] = json.dumps(webhook_config)
+
+ await redis.hset(f"task:{task_id}", mapping=task_data)
+
+ # Initialize webhook service
+ webhook_service = WebhookDeliveryService(config)
async def _runner():
try:
@@ -594,6 +820,17 @@ async def handle_crawl_job(
"status": TaskStatus.COMPLETED,
"result": json.dumps(result),
})
+
+ # Send webhook notification on successful completion
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="crawl",
+ status="completed",
+ urls=urls,
+ webhook_config=webhook_config,
+ result=result
+ )
+
await asyncio.sleep(5) # Give Redis time to process the update
except Exception as exc:
await redis.hset(f"task:{task_id}", mapping={
@@ -601,5 +838,15 @@ async def handle_crawl_job(
"error": str(exc),
})
+ # Send webhook notification on failure
+ await webhook_service.notify_job_completion(
+ task_id=task_id,
+ task_type="crawl",
+ status="failed",
+ urls=urls,
+ webhook_config=webhook_config,
+ error=str(exc)
+ )
+
background_tasks.add_task(_runner)
return {"task_id": task_id}
\ No newline at end of file
diff --git a/deploy/docker/auth.py b/deploy/docker/auth.py
index f9e75d78..6fcef339 100644
--- a/deploy/docker/auth.py
+++ b/deploy/docker/auth.py
@@ -28,25 +28,43 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
signing_key = get_jwk_from_secret(SECRET_KEY)
return instance.encode(to_encode, signing_key, alg='HS256')
-def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
+def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
"""Verify the JWT token from the Authorization header."""
-
- if credentials is None:
- return None
+
+ if not credentials or not credentials.credentials:
+ raise HTTPException(
+ status_code=401,
+ detail="No token provided",
+ headers={"WWW-Authenticate": "Bearer"}
+ )
+
token = credentials.credentials
verifying_key = get_jwk_from_secret(SECRET_KEY)
try:
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
return payload
- except Exception:
- raise HTTPException(status_code=401, detail="Invalid or expired token")
+ except Exception as e:
+ raise HTTPException(
+ status_code=401,
+ detail=f"Invalid or expired token: {str(e)}",
+ headers={"WWW-Authenticate": "Bearer"}
+ )
def get_token_dependency(config: Dict):
"""Return the token dependency if JWT is enabled, else a function that returns None."""
-
+
if config.get("security", {}).get("jwt_enabled", False):
- return verify_token
+ def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
+ """Enforce JWT authentication when enabled."""
+ if credentials is None:
+ raise HTTPException(
+ status_code=401,
+ detail="Authentication required. Please provide a valid Bearer token.",
+ headers={"WWW-Authenticate": "Bearer"}
+ )
+ return verify_token(credentials)
+ return jwt_required
else:
return lambda: None
diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md
index eb29b94c..c18fbc78 100644
--- a/deploy/docker/c4ai-code-context.md
+++ b/deploy/docker/c4ai-code-context.md
@@ -7520,17 +7520,18 @@ class BrowserManager:
)
os.makedirs(browser_args["downloads_path"], exist_ok=True)
- if self.config.proxy or self.config.proxy_config:
+ if self.config.proxy:
+ warnings.warn(
+ "BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
+ DeprecationWarning,
+ )
+ if self.config.proxy_config:
from playwright.async_api import ProxySettings
- proxy_settings = (
- ProxySettings(server=self.config.proxy)
- if self.config.proxy
- else ProxySettings(
- server=self.config.proxy_config.server,
- username=self.config.proxy_config.username,
- password=self.config.proxy_config.password,
- )
+ proxy_settings = ProxySettings(
+ server=self.config.proxy_config.server,
+ username=self.config.proxy_config.username,
+ password=self.config.proxy_config.password,
)
browser_args["proxy"] = proxy_settings
diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md
index 74ad794f..abfd3637 100644
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -2241,7 +2241,7 @@ docker build -t crawl4ai
| Argument | Description | Default | Options |
|----------|-------------|---------|----------|
-| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
+| PYTHON_VERSION | Python version | 3.10 | 3.10, 3.11, 3.12, 3.13 |
| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
| ENABLE_GPU | GPU support | false | true, false |
| APP_HOME | Install path | /app | any valid path |
diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml
index c81badc4..5790d5be 100644
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -3,7 +3,7 @@ app:
title: "Crawl4AI API"
version: "1.0.0"
host: "0.0.0.0"
- port: 11234
+ port: 11235
reload: False
workers: 1
timeout_keep_alive: 300
@@ -11,8 +11,7 @@ app:
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini"
- api_key_env: "OPENAI_API_KEY"
- # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
+ # api_key: sk-... # If you pass the API key directly (not recommended)
# Redis Configuration
redis:
@@ -39,8 +38,8 @@ rate_limiting:
# Security Configuration
security:
- enabled: false
- jwt_enabled: false
+ enabled: false
+ jwt_enabled: false
https_redirect: false
trusted_hosts: ["*"]
headers:
@@ -62,7 +61,7 @@ crawler:
batch_process: 300.0 # Timeout for batch processing
pool:
max_pages: 40 # ← GLOBAL_SEM permits
- idle_ttl_sec: 1800 # ← 30 min janitor cutoff
+    idle_ttl_sec: 300 # ← 5 min janitor cutoff
browser:
kwargs:
headless: true
@@ -88,4 +87,17 @@ observability:
enabled: True
endpoint: "/metrics"
health_check:
- endpoint: "/health"
\ No newline at end of file
+ endpoint: "/health"
+
+# Webhook Configuration
+webhooks:
+ enabled: true
+ default_url: null # Optional: default webhook URL for all jobs
+ data_in_payload: false # Optional: default behavior for including data
+ retry:
+ max_attempts: 5
+ initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
+ max_delay_ms: 32000
+ timeout_ms: 30000 # 30s timeout per webhook call
+ headers: # Optional: default headers to include
+ User-Agent: "Crawl4AI-Webhook/1.0"
\ No newline at end of file
diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py
index d15102e4..509cbba9 100644
--- a/deploy/docker/crawler_pool.py
+++ b/deploy/docker/crawler_pool.py
@@ -1,60 +1,170 @@
-# crawler_pool.py (new file)
-import asyncio, json, hashlib, time, psutil
+# crawler_pool.py - Smart browser pool with tiered management
+import asyncio, json, hashlib, time
from contextlib import suppress
-from typing import Dict
+from typing import Dict, Optional
from crawl4ai import AsyncWebCrawler, BrowserConfig
-from typing import Dict
-from utils import load_config
+from utils import load_config, get_container_memory_percent
+import logging
+logger = logging.getLogger(__name__)
CONFIG = load_config()
-POOL: Dict[str, AsyncWebCrawler] = {}
+# Pool tiers
+PERMANENT: Optional[AsyncWebCrawler] = None # Always-ready default browser
+HOT_POOL: Dict[str, AsyncWebCrawler] = {} # Frequent configs
+COLD_POOL: Dict[str, AsyncWebCrawler] = {} # Rare configs
LAST_USED: Dict[str, float] = {}
+USAGE_COUNT: Dict[str, int] = {}
LOCK = asyncio.Lock()
-MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
-IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min
+# Config
+MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0)
+BASE_IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 300)
+DEFAULT_CONFIG_SIG = None # Cached sig for default config
def _sig(cfg: BrowserConfig) -> str:
+ """Generate config signature."""
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
return hashlib.sha1(payload.encode()).hexdigest()
+def _is_default_config(sig: str) -> bool:
+ """Check if config matches default."""
+ return sig == DEFAULT_CONFIG_SIG
+
async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
- try:
- sig = _sig(cfg)
- async with LOCK:
- if sig in POOL:
- LAST_USED[sig] = time.time();
- return POOL[sig]
- if psutil.virtual_memory().percent >= MEM_LIMIT:
- raise MemoryError("RAM pressure – new browser denied")
- crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
- await crawler.start()
- POOL[sig] = crawler; LAST_USED[sig] = time.time()
- return crawler
- except MemoryError as e:
- raise MemoryError(f"RAM pressure – new browser denied: {e}")
- except Exception as e:
- raise RuntimeError(f"Failed to start browser: {e}")
- finally:
- if sig in POOL:
- LAST_USED[sig] = time.time()
- else:
- # If we failed to start the browser, we should remove it from the pool
- POOL.pop(sig, None)
- LAST_USED.pop(sig, None)
- # If we failed to start the browser, we should remove it from the pool
-async def close_all():
+ """Get crawler from pool with tiered strategy."""
+ sig = _sig(cfg)
async with LOCK:
- await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True)
- POOL.clear(); LAST_USED.clear()
+ # Check permanent browser for default config
+ if PERMANENT and _is_default_config(sig):
+ LAST_USED[sig] = time.time()
+ USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
+ logger.info("🔥 Using permanent browser")
+ return PERMANENT
+
+ # Check hot pool
+ if sig in HOT_POOL:
+ LAST_USED[sig] = time.time()
+ USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
+ logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})")
+ return HOT_POOL[sig]
+
+ # Check cold pool (promote to hot if used 3+ times)
+ if sig in COLD_POOL:
+ LAST_USED[sig] = time.time()
+ USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
+
+ if USAGE_COUNT[sig] >= 3:
+ logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})")
+ HOT_POOL[sig] = COLD_POOL.pop(sig)
+
+ # Track promotion in monitor
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]})
+ except:
+ pass
+
+ return HOT_POOL[sig]
+
+ logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})")
+ return COLD_POOL[sig]
+
+ # Memory check before creating new
+ mem_pct = get_container_memory_percent()
+ if mem_pct >= MEM_LIMIT:
+ logger.error(f"💥 Memory pressure: {mem_pct:.1f}% >= {MEM_LIMIT}%")
+ raise MemoryError(f"Memory at {mem_pct:.1f}%, refusing new browser")
+
+ # Create new in cold pool
+ logger.info(f"🆕 Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)")
+ crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
+ await crawler.start()
+ COLD_POOL[sig] = crawler
+ LAST_USED[sig] = time.time()
+ USAGE_COUNT[sig] = 1
+ return crawler
+
+async def init_permanent(cfg: BrowserConfig):
+ """Initialize permanent default browser."""
+ global PERMANENT, DEFAULT_CONFIG_SIG
+ async with LOCK:
+ if PERMANENT:
+ return
+ DEFAULT_CONFIG_SIG = _sig(cfg)
+ logger.info("🔥 Creating permanent default browser")
+ PERMANENT = AsyncWebCrawler(config=cfg, thread_safe=False)
+ await PERMANENT.start()
+ LAST_USED[DEFAULT_CONFIG_SIG] = time.time()
+ USAGE_COUNT[DEFAULT_CONFIG_SIG] = 0
+
+async def close_all():
+ """Close all browsers."""
+ async with LOCK:
+ tasks = []
+ if PERMANENT:
+ tasks.append(PERMANENT.close())
+ tasks.extend([c.close() for c in HOT_POOL.values()])
+ tasks.extend([c.close() for c in COLD_POOL.values()])
+ await asyncio.gather(*tasks, return_exceptions=True)
+ HOT_POOL.clear()
+ COLD_POOL.clear()
+ LAST_USED.clear()
+ USAGE_COUNT.clear()
async def janitor():
+ """Adaptive cleanup based on memory pressure."""
while True:
- await asyncio.sleep(60)
+ mem_pct = get_container_memory_percent()
+
+ # Adaptive intervals and TTLs
+ if mem_pct > 80:
+ interval, cold_ttl, hot_ttl = 10, 30, 120
+ elif mem_pct > 60:
+ interval, cold_ttl, hot_ttl = 30, 60, 300
+ else:
+ interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL * 2
+
+ await asyncio.sleep(interval)
+
now = time.time()
async with LOCK:
- for sig, crawler in list(POOL.items()):
- if now - LAST_USED[sig] > IDLE_TTL:
- with suppress(Exception): await crawler.close()
- POOL.pop(sig, None); LAST_USED.pop(sig, None)
+ # Clean cold pool
+ for sig in list(COLD_POOL.keys()):
+ if now - LAST_USED.get(sig, now) > cold_ttl:
+ idle_time = now - LAST_USED[sig]
+ logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
+ with suppress(Exception):
+ await COLD_POOL[sig].close()
+ COLD_POOL.pop(sig, None)
+ LAST_USED.pop(sig, None)
+ USAGE_COUNT.pop(sig, None)
+
+ # Track in monitor
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_janitor_event("close_cold", sig, {"idle_seconds": int(idle_time), "ttl": cold_ttl})
+ except:
+ pass
+
+ # Clean hot pool (more conservative)
+ for sig in list(HOT_POOL.keys()):
+ if now - LAST_USED.get(sig, now) > hot_ttl:
+ idle_time = now - LAST_USED[sig]
+ logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
+ with suppress(Exception):
+ await HOT_POOL[sig].close()
+ HOT_POOL.pop(sig, None)
+ LAST_USED.pop(sig, None)
+ USAGE_COUNT.pop(sig, None)
+
+ # Track in monitor
+ try:
+ from monitor import get_monitor
+ await get_monitor().track_janitor_event("close_hot", sig, {"idle_seconds": int(idle_time), "ttl": hot_ttl})
+ except:
+ pass
+
+ # Log pool stats
+ if mem_pct > 60:
+ logger.info(f"📊 Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, mem={mem_pct:.1f}%")
diff --git a/deploy/docker/hook_manager.py b/deploy/docker/hook_manager.py
new file mode 100644
index 00000000..41c4f25d
--- /dev/null
+++ b/deploy/docker/hook_manager.py
@@ -0,0 +1,512 @@
+"""
+Hook Manager for User-Provided Hook Functions
+Handles validation, compilation, and safe execution of user-provided hook code
+"""
+
+import ast
+import asyncio
+import traceback
+from typing import Dict, Callable, Optional, Tuple, List, Any
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class UserHookManager:
+ """Manages user-provided hook functions with error isolation"""
+
+ # Expected signatures for each hook point
+ HOOK_SIGNATURES = {
+ "on_browser_created": ["browser"],
+ "on_page_context_created": ["page", "context"],
+ "before_goto": ["page", "context", "url"],
+ "after_goto": ["page", "context", "url", "response"],
+ "on_user_agent_updated": ["page", "context", "user_agent"],
+ "on_execution_started": ["page", "context"],
+ "before_retrieve_html": ["page", "context"],
+ "before_return_html": ["page", "context", "html"]
+ }
+
+ # Default timeout for hook execution (in seconds)
+ DEFAULT_TIMEOUT = 30
+
+ def __init__(self, timeout: int = DEFAULT_TIMEOUT):
+ self.timeout = timeout
+ self.errors: List[Dict[str, Any]] = []
+ self.compiled_hooks: Dict[str, Callable] = {}
+ self.execution_log: List[Dict[str, Any]] = []
+
+ def validate_hook_structure(self, hook_code: str, hook_point: str) -> Tuple[bool, str]:
+ """
+ Validate the structure of user-provided hook code
+
+ Args:
+ hook_code: The Python code string containing the hook function
+ hook_point: The hook point name (e.g., 'on_page_context_created')
+
+ Returns:
+ Tuple of (is_valid, error_message)
+ """
+ try:
+ # Parse the code
+ tree = ast.parse(hook_code)
+
+ # Check if it's empty
+ if not tree.body:
+ return False, "Hook code is empty"
+
+ # Find the function definition
+ func_def = None
+ for node in tree.body:
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+ func_def = node
+ break
+
+ if not func_def:
+ return False, "Hook must contain a function definition (def or async def)"
+
+ # Check if it's async (all hooks should be async)
+ if not isinstance(func_def, ast.AsyncFunctionDef):
+ return False, f"Hook function must be async (use 'async def' instead of 'def')"
+
+ # Get function name for better error messages
+ func_name = func_def.name
+
+ # Validate parameters
+ expected_params = self.HOOK_SIGNATURES.get(hook_point, [])
+ if not expected_params:
+ return False, f"Unknown hook point: {hook_point}"
+
+ func_params = [arg.arg for arg in func_def.args.args]
+
+ # Check if it has **kwargs for flexibility
+ has_kwargs = func_def.args.kwarg is not None
+
+ # Must have at least the expected parameters
+ missing_params = []
+ for expected in expected_params:
+ if expected not in func_params:
+ missing_params.append(expected)
+
+ if missing_params and not has_kwargs:
+ return False, f"Hook function '{func_name}' must accept parameters: {', '.join(expected_params)} (missing: {', '.join(missing_params)})"
+
+ # Check if it returns something (should return page or browser)
+ has_return = any(isinstance(node, ast.Return) for node in ast.walk(func_def))
+ if not has_return:
+ # Warning, not error - we'll handle this
+ logger.warning(f"Hook function '{func_name}' should return the {expected_params[0]} object")
+
+ return True, "Valid"
+
+ except SyntaxError as e:
+ return False, f"Syntax error at line {e.lineno}: {str(e)}"
+ except Exception as e:
+ return False, f"Failed to parse hook code: {str(e)}"
+
+ def compile_hook(self, hook_code: str, hook_point: str) -> Optional[Callable]:
+ """
+ Compile user-provided hook code into a callable function
+
+ Args:
+ hook_code: The Python code string
+ hook_point: The hook point name
+
+ Returns:
+ Compiled function or None if compilation failed
+ """
+ try:
+ # Create a safe namespace for the hook
+ # Use a more complete builtins that includes __import__
+ import builtins
+ safe_builtins = {}
+
+ # Add safe built-in functions
+ allowed_builtins = [
+ 'print', 'len', 'str', 'int', 'float', 'bool',
+ 'list', 'dict', 'set', 'tuple', 'range', 'enumerate',
+ 'zip', 'map', 'filter', 'any', 'all', 'sum', 'min', 'max',
+ 'sorted', 'reversed', 'abs', 'round', 'isinstance', 'type',
+ 'getattr', 'hasattr', 'setattr', 'callable', 'iter', 'next',
+ '__import__', '__build_class__' # Required for exec
+ ]
+
+ for name in allowed_builtins:
+ if hasattr(builtins, name):
+ safe_builtins[name] = getattr(builtins, name)
+
+ namespace = {
+ '__name__': f'user_hook_{hook_point}',
+ '__builtins__': safe_builtins
+ }
+
+ # Add commonly needed imports
+ exec("import asyncio", namespace)
+ exec("import json", namespace)
+ exec("import re", namespace)
+ exec("from typing import Dict, List, Optional", namespace)
+
+ # Execute the code to define the function
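+            # NOTE: a trimmed builtins dict limits accidents but is not a
+            # security sandbox; __import__ stays exposed, so hook code can
+            # still import arbitrary modules.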
+ exec(hook_code, namespace)
+
+ # Find the async function in the namespace
+ for name, obj in namespace.items():
+ if callable(obj) and not name.startswith('_') and asyncio.iscoroutinefunction(obj):
+ return obj
+
+ # If no async function found, look for any function
+ for name, obj in namespace.items():
+ if callable(obj) and not name.startswith('_'):
+ logger.warning(f"Found non-async function '{name}' - wrapping it")
+ # Wrap sync function in async
+ async def async_wrapper(*args, **kwargs):
+ return obj(*args, **kwargs)
+ return async_wrapper
+
+ raise ValueError("No callable function found in hook code")
+
+ except Exception as e:
+ error = {
+ 'hook_point': hook_point,
+ 'error': f"Failed to compile hook: {str(e)}",
+ 'type': 'compilation_error',
+ 'traceback': traceback.format_exc()
+ }
+ self.errors.append(error)
+ logger.error(f"Hook compilation failed for {hook_point}: {str(e)}")
+ return None
+
+ async def execute_hook_safely(
+ self,
+ hook_func: Callable,
+ hook_point: str,
+ *args,
+ **kwargs
+ ) -> Tuple[Any, Optional[Dict]]:
+ """
+ Execute a user hook with error isolation and timeout
+
+ Args:
+ hook_func: The compiled hook function
+ hook_point: The hook point name
+ *args, **kwargs: Arguments to pass to the hook
+
+ Returns:
+ Tuple of (result, error_dict)
+ """
+ start_time = asyncio.get_event_loop().time()
+
+ try:
+ # Add timeout to prevent infinite loops
+ result = await asyncio.wait_for(
+ hook_func(*args, **kwargs),
+ timeout=self.timeout
+ )
+
+ # Log successful execution
+ execution_time = asyncio.get_event_loop().time() - start_time
+ self.execution_log.append({
+ 'hook_point': hook_point,
+ 'status': 'success',
+ 'execution_time': execution_time,
+ 'timestamp': start_time
+ })
+
+ return result, None
+
+ except asyncio.TimeoutError:
+ error = {
+ 'hook_point': hook_point,
+ 'error': f'Hook execution timed out ({self.timeout}s limit)',
+ 'type': 'timeout',
+ 'execution_time': self.timeout
+ }
+ self.errors.append(error)
+ self.execution_log.append({
+ 'hook_point': hook_point,
+ 'status': 'timeout',
+ 'error': error['error'],
+ 'execution_time': self.timeout,
+ 'timestamp': start_time
+ })
+ # Return the first argument (usually page/browser) to continue
+ return args[0] if args else None, error
+
+ except Exception as e:
+ execution_time = asyncio.get_event_loop().time() - start_time
+ error = {
+ 'hook_point': hook_point,
+ 'error': str(e),
+ 'type': type(e).__name__,
+ 'traceback': traceback.format_exc(),
+ 'execution_time': execution_time
+ }
+ self.errors.append(error)
+ self.execution_log.append({
+ 'hook_point': hook_point,
+ 'status': 'failed',
+ 'error': str(e),
+ 'error_type': type(e).__name__,
+ 'execution_time': execution_time,
+ 'timestamp': start_time
+ })
+ # Return the first argument (usually page/browser) to continue
+ return args[0] if args else None, error
+
+ def get_summary(self) -> Dict[str, Any]:
+ """Get a summary of hook execution"""
+ total_hooks = len(self.execution_log)
+ successful = sum(1 for log in self.execution_log if log['status'] == 'success')
+ failed = sum(1 for log in self.execution_log if log['status'] == 'failed')
+ timed_out = sum(1 for log in self.execution_log if log['status'] == 'timeout')
+
+ return {
+ 'total_executions': total_hooks,
+ 'successful': successful,
+ 'failed': failed,
+ 'timed_out': timed_out,
+ 'success_rate': (successful / total_hooks * 100) if total_hooks > 0 else 0,
+ 'total_errors': len(self.errors)
+ }
+
+
+class IsolatedHookWrapper:
+ """Wraps user hooks with error isolation and reporting"""
+
+ def __init__(self, hook_manager: UserHookManager):
+ self.hook_manager = hook_manager
+
+ def create_hook_wrapper(self, user_hook: Callable, hook_point: str) -> Callable:
+ """
+ Create a wrapper that isolates hook errors from main process
+
+ Args:
+ user_hook: The compiled user hook function
+ hook_point: The hook point name
+
+ Returns:
+ Wrapped async function that handles errors gracefully
+ """
+
+ async def wrapped_hook(*args, **kwargs):
+ """Wrapped hook with error isolation"""
+ # Get the main return object (page/browser)
+ # This ensures we always have something to return
+ return_obj = None
+ if args:
+ return_obj = args[0]
+ elif 'page' in kwargs:
+ return_obj = kwargs['page']
+ elif 'browser' in kwargs:
+ return_obj = kwargs['browser']
+
+ try:
+ # Execute user hook with safety
+ result, error = await self.hook_manager.execute_hook_safely(
+ user_hook,
+ hook_point,
+ *args,
+ **kwargs
+ )
+
+ if error:
+ # Hook failed but we continue with original object
+ logger.warning(f"User hook failed at {hook_point}: {error['error']}")
+ return return_obj
+
+ # Hook succeeded - return its result or the original object
+ if result is None:
+ logger.debug(f"Hook at {hook_point} returned None, using original object")
+ return return_obj
+
+ return result
+
+ except Exception as e:
+ # This should rarely happen due to execute_hook_safely
+ logger.error(f"Unexpected error in hook wrapper for {hook_point}: {e}")
+ return return_obj
+
+ # Set function name for debugging
+ wrapped_hook.__name__ = f"wrapped_{hook_point}"
+ return wrapped_hook
+
+
+async def process_user_hooks(
+ hooks_input: Dict[str, str],
+ timeout: int = 30
+) -> Tuple[Dict[str, Callable], List[Dict], UserHookManager]:
+ """
+ Process and compile user-provided hook functions
+
+ Args:
+ hooks_input: Dictionary mapping hook points to code strings
+ timeout: Timeout for each hook execution
+
+ Returns:
+ Tuple of (compiled_hooks, validation_errors, hook_manager)
+ """
+
+ hook_manager = UserHookManager(timeout=timeout)
+ wrapper = IsolatedHookWrapper(hook_manager)
+ compiled_hooks = {}
+ validation_errors = []
+
+ for hook_point, hook_code in hooks_input.items():
+ # Skip empty hooks
+ if not hook_code or not hook_code.strip():
+ continue
+
+ # Validate hook point
+ if hook_point not in UserHookManager.HOOK_SIGNATURES:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+ continue
+
+ # Validate structure
+ is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
+ if not is_valid:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': message,
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+ continue
+
+ # Compile the hook
+ hook_func = hook_manager.compile_hook(hook_code, hook_point)
+ if hook_func:
+ # Wrap with error isolation
+ wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
+ compiled_hooks[hook_point] = wrapped_hook
+ logger.info(f"Successfully compiled hook for {hook_point}")
+ else:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': 'Failed to compile hook function - check syntax and structure',
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+
+ return compiled_hooks, validation_errors, hook_manager
+
+
+async def process_user_hooks_with_manager(
+ hooks_input: Dict[str, str],
+ hook_manager: UserHookManager
+) -> Tuple[Dict[str, Callable], List[Dict]]:
+ """
+ Process and compile user-provided hook functions with existing manager
+
+ Args:
+ hooks_input: Dictionary mapping hook points to code strings
+ hook_manager: Existing UserHookManager instance
+
+ Returns:
+ Tuple of (compiled_hooks, validation_errors)
+ """
+
+ wrapper = IsolatedHookWrapper(hook_manager)
+ compiled_hooks = {}
+ validation_errors = []
+
+ for hook_point, hook_code in hooks_input.items():
+ # Skip empty hooks
+ if not hook_code or not hook_code.strip():
+ continue
+
+ # Validate hook point
+ if hook_point not in UserHookManager.HOOK_SIGNATURES:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+ continue
+
+ # Validate structure
+ is_valid, message = hook_manager.validate_hook_structure(hook_code, hook_point)
+ if not is_valid:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': message,
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+ continue
+
+ # Compile the hook
+ hook_func = hook_manager.compile_hook(hook_code, hook_point)
+ if hook_func:
+ # Wrap with error isolation
+ wrapped_hook = wrapper.create_hook_wrapper(hook_func, hook_point)
+ compiled_hooks[hook_point] = wrapped_hook
+ logger.info(f"Successfully compiled hook for {hook_point}")
+ else:
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': 'Failed to compile hook function - check syntax and structure',
+ 'code_preview': hook_code[:100] + '...' if len(hook_code) > 100 else hook_code
+ })
+
+ return compiled_hooks, validation_errors
+
+
+async def attach_user_hooks_to_crawler(
+ crawler, # AsyncWebCrawler instance
+ user_hooks: Dict[str, str],
+ timeout: int = 30,
+ hook_manager: Optional[UserHookManager] = None
+) -> Tuple[Dict[str, Any], UserHookManager]:
+ """
+ Attach user-provided hooks to crawler with full error reporting
+
+ Args:
+ crawler: AsyncWebCrawler instance
+ user_hooks: Dictionary mapping hook points to code strings
+ timeout: Timeout for each hook execution
+ hook_manager: Optional existing UserHookManager instance
+
+ Returns:
+ Tuple of (status_dict, hook_manager)
+ """
+
+ # Use provided hook_manager or create a new one
+ if hook_manager is None:
+ hook_manager = UserHookManager(timeout=timeout)
+
+ # Process hooks with the hook_manager
+ compiled_hooks, validation_errors = await process_user_hooks_with_manager(
+ user_hooks, hook_manager
+ )
+
+ # Log validation errors
+ if validation_errors:
+ logger.warning(f"Hook validation errors: {validation_errors}")
+
+ # Attach successfully compiled hooks
+ attached_hooks = []
+ for hook_point, wrapped_hook in compiled_hooks.items():
+ try:
+ crawler.crawler_strategy.set_hook(hook_point, wrapped_hook)
+ attached_hooks.append(hook_point)
+ logger.info(f"Attached hook to {hook_point}")
+ except Exception as e:
+ logger.error(f"Failed to attach hook to {hook_point}: {e}")
+ validation_errors.append({
+ 'hook_point': hook_point,
+ 'error': f'Failed to attach hook: {str(e)}'
+ })
+
+ status = 'success' if not validation_errors else ('partial' if attached_hooks else 'failed')
+
+ status_dict = {
+ 'status': status,
+ 'attached_hooks': attached_hooks,
+ 'validation_errors': validation_errors,
+ 'total_hooks_provided': len(user_hooks),
+ 'successfully_attached': len(attached_hooks),
+ 'failed_validation': len(validation_errors)
+ }
+
+ return status_dict, hook_manager
\ No newline at end of file
diff --git a/deploy/docker/job.py b/deploy/docker/job.py
index 10d83fdd..8fae16cd 100644
--- a/deploy/docker/job.py
+++ b/deploy/docker/job.py
@@ -12,6 +12,7 @@ from api import (
handle_crawl_job,
handle_task_status,
)
+from schemas import WebhookConfig
# ------------- dependency placeholders -------------
_redis = None # will be injected from server.py
@@ -37,12 +38,16 @@ class LlmJobPayload(BaseModel):
schema: Optional[str] = None
cache: bool = False
provider: Optional[str] = None
+ webhook_config: Optional[WebhookConfig] = None
+ temperature: Optional[float] = None
+ base_url: Optional[str] = None
class CrawlJobPayload(BaseModel):
urls: list[HttpUrl]
browser_config: Dict = {}
crawler_config: Dict = {}
+ webhook_config: Optional[WebhookConfig] = None
# ---------- LLM job ---------------------------------------------------------
@@ -53,6 +58,10 @@ async def llm_job_enqueue(
request: Request,
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
):
+ webhook_config = None
+ if payload.webhook_config:
+ webhook_config = payload.webhook_config.model_dump(mode='json')
+
return await handle_llm_request(
_redis,
background_tasks,
@@ -63,6 +72,9 @@ async def llm_job_enqueue(
cache=payload.cache,
config=_config,
provider=payload.provider,
+ webhook_config=webhook_config,
+ temperature=payload.temperature,
+ api_base_url=payload.base_url,
)
@@ -72,7 +84,8 @@ async def llm_job_status(
+    request: Request,
    task_id: str,
    _td: Dict = Depends(lambda: _token_dep())
):
-    return await handle_task_status(_redis, task_id)
+    return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
# ---------- CRAWL job -------------------------------------------------------
@@ -82,6 +94,10 @@ async def crawl_job_enqueue(
background_tasks: BackgroundTasks,
_td: Dict = Depends(lambda: _token_dep()),
):
+ webhook_config = None
+ if payload.webhook_config:
+ webhook_config = payload.webhook_config.model_dump(mode='json')
+
return await handle_crawl_job(
_redis,
background_tasks,
@@ -89,6 +105,7 @@ async def crawl_job_enqueue(
payload.browser_config,
payload.crawler_config,
config=_config,
+ webhook_config=webhook_config,
)
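With `webhook_config` accepted on both job payloads, clients can receive a push notification on completion instead of polling the status endpoint. A hedged sketch of enqueueing a crawl job; the `/crawl/job` path, port 11235, and token-less auth are assumptions about a local deployment:

```python
import httpx

payload = {
    "urls": ["https://example.com"],
    "browser_config": {},
    "crawler_config": {},
    "webhook_config": {
        "webhook_url": "https://my-service.example/notify",  # hypothetical receiver
        "webhook_data_in_payload": True,   # embed crawl results in the callback body
        "webhook_headers": {"X-Api-Key": "secret"},  # forwarded on the callback request
    },
}

resp = httpx.post("http://localhost:11235/crawl/job", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # should include a task_id; the webhook fires when the job finishes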
diff --git a/deploy/docker/monitor.py b/deploy/docker/monitor.py
new file mode 100644
index 00000000..469ec36c
--- /dev/null
+++ b/deploy/docker/monitor.py
@@ -0,0 +1,382 @@
+# monitor.py - Real-time monitoring stats with Redis persistence
+import time
+import json
+import asyncio
+from typing import Dict, List, Optional
+from datetime import datetime, timezone
+from collections import deque
+from redis import asyncio as aioredis
+from utils import get_container_memory_percent
+import psutil
+import logging
+
+logger = logging.getLogger(__name__)
+
+class MonitorStats:
+ """Tracks real-time server stats with Redis persistence."""
+
+ def __init__(self, redis: aioredis.Redis):
+ self.redis = redis
+ self.start_time = time.time()
+
+ # In-memory queues (fast reads, Redis backup)
+ self.active_requests: Dict[str, Dict] = {} # id -> request info
+ self.completed_requests: deque = deque(maxlen=100) # Last 100
+ self.janitor_events: deque = deque(maxlen=100)
+ self.errors: deque = deque(maxlen=100)
+
+ # Endpoint stats (persisted in Redis)
+ self.endpoint_stats: Dict[str, Dict] = {} # endpoint -> {count, total_time, errors, ...}
+
+ # Background persistence queue (max 10 pending persist requests)
+ self._persist_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
+ self._persist_worker_task: Optional[asyncio.Task] = None
+
+ # Timeline data (5min window, 5s resolution = 60 points)
+ self.memory_timeline: deque = deque(maxlen=60)
+ self.requests_timeline: deque = deque(maxlen=60)
+ self.browser_timeline: deque = deque(maxlen=60)
+
+    async def track_request_start(self, request_id: str, endpoint: str, url: str, config: Optional[Dict] = None):
+ """Track new request start."""
+ req_info = {
+ "id": request_id,
+ "endpoint": endpoint,
+ "url": url[:100], # Truncate long URLs
+ "start_time": time.time(),
+ "config_sig": config.get("sig", "default") if config else "default",
+ "mem_start": psutil.Process().memory_info().rss / (1024 * 1024)
+ }
+ self.active_requests[request_id] = req_info
+
+ # Increment endpoint counter
+ if endpoint not in self.endpoint_stats:
+ self.endpoint_stats[endpoint] = {
+ "count": 0, "total_time": 0, "errors": 0,
+ "pool_hits": 0, "success": 0
+ }
+ self.endpoint_stats[endpoint]["count"] += 1
+
+ # Queue persistence (handled by background worker)
+ try:
+ self._persist_queue.put_nowait(True)
+ except asyncio.QueueFull:
+ logger.warning("Persistence queue full, skipping")
+
+    async def track_request_end(self, request_id: str, success: bool, error: Optional[str] = None,
+                                pool_hit: bool = True, status_code: int = 200):
+ """Track request completion."""
+ if request_id not in self.active_requests:
+ return
+
+ req_info = self.active_requests.pop(request_id)
+ end_time = time.time()
+ elapsed = end_time - req_info["start_time"]
+ mem_end = psutil.Process().memory_info().rss / (1024 * 1024)
+ mem_delta = mem_end - req_info["mem_start"]
+
+ # Update stats
+ endpoint = req_info["endpoint"]
+ if endpoint in self.endpoint_stats:
+ self.endpoint_stats[endpoint]["total_time"] += elapsed
+ if success:
+ self.endpoint_stats[endpoint]["success"] += 1
+ else:
+ self.endpoint_stats[endpoint]["errors"] += 1
+ if pool_hit:
+ self.endpoint_stats[endpoint]["pool_hits"] += 1
+
+ # Add to completed queue
+ completed = {
+ **req_info,
+ "end_time": end_time,
+ "elapsed": round(elapsed, 2),
+ "mem_delta": round(mem_delta, 1),
+ "success": success,
+ "error": error,
+ "status_code": status_code,
+ "pool_hit": pool_hit
+ }
+ self.completed_requests.append(completed)
+
+ # Track errors
+ if not success and error:
+ self.errors.append({
+ "timestamp": end_time,
+ "endpoint": endpoint,
+ "url": req_info["url"],
+ "error": error,
+ "request_id": request_id
+ })
+
+ await self._persist_endpoint_stats()
+
+ async def track_janitor_event(self, event_type: str, sig: str, details: Dict):
+ """Track janitor cleanup events."""
+ self.janitor_events.append({
+ "timestamp": time.time(),
+ "type": event_type, # "close_cold", "close_hot", "promote"
+ "sig": sig[:8],
+ "details": details
+ })
+
+ def _cleanup_old_entries(self, max_age_seconds: int = 300):
+ """Remove entries older than max_age_seconds (default 5min)."""
+ now = time.time()
+ cutoff = now - max_age_seconds
+
+ # Clean completed requests
+ while self.completed_requests and self.completed_requests[0].get("end_time", 0) < cutoff:
+ self.completed_requests.popleft()
+
+ # Clean janitor events
+ while self.janitor_events and self.janitor_events[0].get("timestamp", 0) < cutoff:
+ self.janitor_events.popleft()
+
+ # Clean errors
+ while self.errors and self.errors[0].get("timestamp", 0) < cutoff:
+ self.errors.popleft()
+
+ async def update_timeline(self):
+ """Update timeline data points (called every 5s)."""
+ now = time.time()
+ mem_pct = get_container_memory_percent()
+
+ # Clean old entries (keep last 5 minutes)
+ self._cleanup_old_entries(max_age_seconds=300)
+
+ # Count requests in last 5s
+ recent_reqs = sum(1 for req in self.completed_requests
+ if now - req.get("end_time", 0) < 5)
+
+ # Browser counts (acquire lock to prevent race conditions)
+ from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
+ async with LOCK:
+ browser_count = {
+ "permanent": 1 if PERMANENT else 0,
+ "hot": len(HOT_POOL),
+ "cold": len(COLD_POOL)
+ }
+
+ self.memory_timeline.append({"time": now, "value": mem_pct})
+ self.requests_timeline.append({"time": now, "value": recent_reqs})
+ self.browser_timeline.append({"time": now, "browsers": browser_count})
+
+ async def _persist_endpoint_stats(self):
+ """Persist endpoint stats to Redis."""
+ try:
+ await self.redis.set(
+ "monitor:endpoint_stats",
+ json.dumps(self.endpoint_stats),
+ ex=86400 # 24h TTL
+ )
+ except Exception as e:
+ logger.warning(f"Failed to persist endpoint stats: {e}")
+
+ async def _persistence_worker(self):
+ """Background worker to persist stats to Redis."""
+ while True:
+ try:
+ await self._persist_queue.get()
+ await self._persist_endpoint_stats()
+ self._persist_queue.task_done()
+ except asyncio.CancelledError:
+ break
+ except Exception as e:
+ logger.error(f"Persistence worker error: {e}")
+
+ def start_persistence_worker(self):
+ """Start the background persistence worker."""
+ if not self._persist_worker_task:
+ self._persist_worker_task = asyncio.create_task(self._persistence_worker())
+ logger.info("Started persistence worker")
+
+ async def stop_persistence_worker(self):
+ """Stop the background persistence worker."""
+ if self._persist_worker_task:
+ self._persist_worker_task.cancel()
+ try:
+ await self._persist_worker_task
+ except asyncio.CancelledError:
+ pass
+ self._persist_worker_task = None
+ logger.info("Stopped persistence worker")
+
+ async def cleanup(self):
+ """Cleanup on shutdown - persist final stats and stop workers."""
+ logger.info("Monitor cleanup starting...")
+ try:
+ # Persist final stats before shutdown
+ await self._persist_endpoint_stats()
+ # Stop background worker
+ await self.stop_persistence_worker()
+ logger.info("Monitor cleanup completed")
+ except Exception as e:
+ logger.error(f"Monitor cleanup error: {e}")
+
+ async def load_from_redis(self):
+ """Load persisted stats from Redis."""
+ try:
+ data = await self.redis.get("monitor:endpoint_stats")
+ if data:
+ self.endpoint_stats = json.loads(data)
+ logger.info("Loaded endpoint stats from Redis")
+ except Exception as e:
+ logger.warning(f"Failed to load from Redis: {e}")
+
+ async def get_health_summary(self) -> Dict:
+ """Get current system health snapshot."""
+ mem_pct = get_container_memory_percent()
+ cpu_pct = psutil.cpu_percent(interval=0.1)
+
+ # Network I/O (delta since last call)
+ net = psutil.net_io_counters()
+
+ # Pool status (acquire lock to prevent race conditions)
+ from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
+ async with LOCK:
+ # TODO: Track actual browser process memory instead of estimates
+ # These are conservative estimates based on typical Chromium usage
+ permanent_mem = 270 if PERMANENT else 0 # Estimate: ~270MB for permanent browser
+ hot_mem = len(HOT_POOL) * 180 # Estimate: ~180MB per hot pool browser
+ cold_mem = len(COLD_POOL) * 180 # Estimate: ~180MB per cold pool browser
+ permanent_active = PERMANENT is not None
+ hot_count = len(HOT_POOL)
+ cold_count = len(COLD_POOL)
+
+ return {
+ "container": {
+ "memory_percent": round(mem_pct, 1),
+ "cpu_percent": round(cpu_pct, 1),
+ "network_sent_mb": round(net.bytes_sent / (1024**2), 2),
+ "network_recv_mb": round(net.bytes_recv / (1024**2), 2),
+ "uptime_seconds": int(time.time() - self.start_time)
+ },
+ "pool": {
+ "permanent": {"active": permanent_active, "memory_mb": permanent_mem},
+ "hot": {"count": hot_count, "memory_mb": hot_mem},
+ "cold": {"count": cold_count, "memory_mb": cold_mem},
+ "total_memory_mb": permanent_mem + hot_mem + cold_mem
+ },
+ "janitor": {
+ "next_cleanup_estimate": "adaptive", # Would need janitor state
+ "memory_pressure": "LOW" if mem_pct < 60 else "MEDIUM" if mem_pct < 80 else "HIGH"
+ }
+ }
+
+ def get_active_requests(self) -> List[Dict]:
+ """Get list of currently active requests."""
+ now = time.time()
+ return [
+ {
+ **req,
+ "elapsed": round(now - req["start_time"], 1),
+ "status": "running"
+ }
+ for req in self.active_requests.values()
+ ]
+
+ def get_completed_requests(self, limit: int = 50, filter_status: str = "all") -> List[Dict]:
+ """Get recent completed requests."""
+ requests = list(self.completed_requests)[-limit:]
+ if filter_status == "success":
+ requests = [r for r in requests if r.get("success")]
+ elif filter_status == "error":
+ requests = [r for r in requests if not r.get("success")]
+ return requests
+
+ async def get_browser_list(self) -> List[Dict]:
+ """Get detailed browser pool information."""
+ from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG, LOCK
+
+ browsers = []
+ now = time.time()
+
+ # Acquire lock to prevent race conditions during iteration
+ async with LOCK:
+ if PERMANENT:
+ browsers.append({
+ "type": "permanent",
+ "sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
+ "age_seconds": int(now - self.start_time),
+ "last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
+ "memory_mb": 270,
+ "hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
+ "killable": False
+ })
+
+ for sig, crawler in HOT_POOL.items():
+ browsers.append({
+ "type": "hot",
+ "sig": sig[:8],
+ "age_seconds": int(now - self.start_time), # Approximation
+ "last_used_seconds": int(now - LAST_USED.get(sig, now)),
+ "memory_mb": 180, # Estimate
+ "hits": USAGE_COUNT.get(sig, 0),
+ "killable": True
+ })
+
+ for sig, crawler in COLD_POOL.items():
+ browsers.append({
+ "type": "cold",
+ "sig": sig[:8],
+ "age_seconds": int(now - self.start_time),
+ "last_used_seconds": int(now - LAST_USED.get(sig, now)),
+ "memory_mb": 180,
+ "hits": USAGE_COUNT.get(sig, 0),
+ "killable": True
+ })
+
+ return browsers
+
+ def get_endpoint_stats_summary(self) -> Dict[str, Dict]:
+ """Get aggregated endpoint statistics."""
+ summary = {}
+ for endpoint, stats in self.endpoint_stats.items():
+ count = stats["count"]
+ avg_time = (stats["total_time"] / count) if count > 0 else 0
+ success_rate = (stats["success"] / count * 100) if count > 0 else 0
+ pool_hit_rate = (stats["pool_hits"] / count * 100) if count > 0 else 0
+
+ summary[endpoint] = {
+ "count": count,
+ "avg_latency_ms": round(avg_time * 1000, 1),
+ "success_rate_percent": round(success_rate, 1),
+ "pool_hit_rate_percent": round(pool_hit_rate, 1),
+ "errors": stats["errors"]
+ }
+ return summary
+
+ def get_timeline_data(self, metric: str, window: str = "5m") -> Dict:
+ """Get timeline data for charts."""
+ # For now, only 5m window supported
+ if metric == "memory":
+ data = list(self.memory_timeline)
+ elif metric == "requests":
+ data = list(self.requests_timeline)
+ elif metric == "browsers":
+ data = list(self.browser_timeline)
+ else:
+ return {"timestamps": [], "values": []}
+
+ return {
+ "timestamps": [int(d["time"]) for d in data],
+ "values": [d.get("value", d.get("browsers")) for d in data]
+ }
+
+ def get_janitor_log(self, limit: int = 100) -> List[Dict]:
+ """Get recent janitor events."""
+ return list(self.janitor_events)[-limit:]
+
+ def get_errors_log(self, limit: int = 100) -> List[Dict]:
+ """Get recent errors."""
+ return list(self.errors)[-limit:]
+
+# Global instance (initialized in server.py)
+monitor_stats: Optional[MonitorStats] = None
+
+def get_monitor() -> MonitorStats:
+ """Get global monitor instance."""
+ if monitor_stats is None:
+ raise RuntimeError("Monitor not initialized")
+ return monitor_stats
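The intended call pattern is to bracket each request with `track_request_start` / `track_request_end`; a condensed sketch of that pattern (the endpoint name and the elided crawl call are placeholders):

```python
import uuid
from monitor import get_monitor

async def tracked_crawl(url: str) -> None:
    monitor = get_monitor()  # raises RuntimeError if called before lifespan() initializes it
    request_id = str(uuid.uuid4())
    await monitor.track_request_start(request_id, endpoint="/crawl", url=url)
    try:
        ...  # perform the actual crawl here
        await monitor.track_request_end(request_id, success=True, status_code=200)
    except Exception as e:
        # Record the failure so it shows up in the errors log and endpoint stats
        await monitor.track_request_end(request_id, success=False, error=str(e), status_code=500)
        raise
```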
diff --git a/deploy/docker/monitor_routes.py b/deploy/docker/monitor_routes.py
new file mode 100644
index 00000000..fdf156de
--- /dev/null
+++ b/deploy/docker/monitor_routes.py
@@ -0,0 +1,405 @@
+# monitor_routes.py - Monitor API endpoints
+from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
+from pydantic import BaseModel
+from typing import Optional
+from monitor import get_monitor
+import logging
+import asyncio
+import json
+import time
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/monitor", tags=["monitor"])
+
+
+@router.get("/health")
+async def get_health():
+ """Get current system health snapshot."""
+ try:
+ monitor = get_monitor()
+ return await monitor.get_health_summary()
+ except Exception as e:
+ logger.error(f"Error getting health: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/requests")
+async def get_requests(status: str = "all", limit: int = 50):
+ """Get active and completed requests.
+
+ Args:
+ status: Filter by 'active', 'completed', 'success', 'error', or 'all'
+ limit: Max number of completed requests to return (default 50)
+ """
+ # Input validation
+ if status not in ["all", "active", "completed", "success", "error"]:
+ raise HTTPException(400, f"Invalid status: {status}. Must be one of: all, active, completed, success, error")
+ if limit < 1 or limit > 1000:
+ raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
+
+ try:
+ monitor = get_monitor()
+
+ if status == "active":
+ return {"active": monitor.get_active_requests(), "completed": []}
+ elif status == "completed":
+ return {"active": [], "completed": monitor.get_completed_requests(limit)}
+ elif status in ["success", "error"]:
+ return {"active": [], "completed": monitor.get_completed_requests(limit, status)}
+ else: # "all"
+ return {
+ "active": monitor.get_active_requests(),
+ "completed": monitor.get_completed_requests(limit)
+ }
+ except Exception as e:
+ logger.error(f"Error getting requests: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/browsers")
+async def get_browsers():
+ """Get detailed browser pool information."""
+ try:
+ monitor = get_monitor()
+ browsers = await monitor.get_browser_list()
+
+ # Calculate summary stats
+ total_browsers = len(browsers)
+ total_memory = sum(b["memory_mb"] for b in browsers)
+
+ # Calculate reuse rate from recent requests
+ recent = monitor.get_completed_requests(100)
+ pool_hits = sum(1 for r in recent if r.get("pool_hit", False))
+ reuse_rate = (pool_hits / len(recent) * 100) if recent else 0
+
+ return {
+ "browsers": browsers,
+ "summary": {
+ "total_count": total_browsers,
+ "total_memory_mb": total_memory,
+ "reuse_rate_percent": round(reuse_rate, 1)
+ }
+ }
+ except Exception as e:
+ logger.error(f"Error getting browsers: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/endpoints/stats")
+async def get_endpoint_stats():
+ """Get aggregated endpoint statistics."""
+ try:
+ monitor = get_monitor()
+ return monitor.get_endpoint_stats_summary()
+ except Exception as e:
+ logger.error(f"Error getting endpoint stats: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/timeline")
+async def get_timeline(metric: str = "memory", window: str = "5m"):
+ """Get timeline data for charts.
+
+ Args:
+ metric: 'memory', 'requests', or 'browsers'
+ window: Time window (only '5m' supported for now)
+ """
+ # Input validation
+ if metric not in ["memory", "requests", "browsers"]:
+ raise HTTPException(400, f"Invalid metric: {metric}. Must be one of: memory, requests, browsers")
+ if window != "5m":
+ raise HTTPException(400, f"Invalid window: {window}. Only '5m' is currently supported")
+
+ try:
+ monitor = get_monitor()
+ return monitor.get_timeline_data(metric, window)
+ except Exception as e:
+ logger.error(f"Error getting timeline: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/logs/janitor")
+async def get_janitor_log(limit: int = 100):
+ """Get recent janitor cleanup events."""
+ # Input validation
+ if limit < 1 or limit > 1000:
+ raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
+
+ try:
+ monitor = get_monitor()
+ return {"events": monitor.get_janitor_log(limit)}
+ except Exception as e:
+ logger.error(f"Error getting janitor log: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.get("/logs/errors")
+async def get_errors_log(limit: int = 100):
+ """Get recent errors."""
+ # Input validation
+ if limit < 1 or limit > 1000:
+ raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")
+
+ try:
+ monitor = get_monitor()
+ return {"errors": monitor.get_errors_log(limit)}
+ except Exception as e:
+ logger.error(f"Error getting errors log: {e}")
+ raise HTTPException(500, str(e))
+
+
+# ========== Control Actions ==========
+
+class KillBrowserRequest(BaseModel):
+ sig: str
+
+
+@router.post("/actions/cleanup")
+async def force_cleanup():
+ """Force immediate janitor cleanup (kills idle cold pool browsers)."""
+ try:
+ from crawler_pool import COLD_POOL, LAST_USED, USAGE_COUNT, LOCK
+        from contextlib import suppress
+
+        killed_count = 0
+
+ async with LOCK:
+ for sig in list(COLD_POOL.keys()):
+ # Kill all cold pool browsers immediately
+ logger.info(f"🧹 Force cleanup: closing cold browser (sig={sig[:8]})")
+ with suppress(Exception):
+ await COLD_POOL[sig].close()
+ COLD_POOL.pop(sig, None)
+ LAST_USED.pop(sig, None)
+ USAGE_COUNT.pop(sig, None)
+ killed_count += 1
+
+ monitor = get_monitor()
+ await monitor.track_janitor_event("force_cleanup", "manual", {"killed": killed_count})
+
+ return {"success": True, "killed_browsers": killed_count}
+ except Exception as e:
+ logger.error(f"Error during force cleanup: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.post("/actions/kill_browser")
+async def kill_browser(req: KillBrowserRequest):
+ """Kill a specific browser by signature (hot or cold only).
+
+ Args:
+ sig: Browser config signature (first 8 chars)
+ """
+ try:
+ from crawler_pool import HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG
+ from contextlib import suppress
+
+ # Find full signature matching prefix
+ target_sig = None
+ pool_type = None
+
+ async with LOCK:
+ # Check hot pool
+ for sig in HOT_POOL.keys():
+ if sig.startswith(req.sig):
+ target_sig = sig
+ pool_type = "hot"
+ break
+
+ # Check cold pool
+ if not target_sig:
+ for sig in COLD_POOL.keys():
+ if sig.startswith(req.sig):
+ target_sig = sig
+ pool_type = "cold"
+ break
+
+ # Check if trying to kill permanent
+ if DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig):
+ raise HTTPException(403, "Cannot kill permanent browser. Use restart instead.")
+
+ if not target_sig:
+ raise HTTPException(404, f"Browser with sig={req.sig} not found")
+
+ # Warn if there are active requests (browser might be in use)
+ monitor = get_monitor()
+ active_count = len(monitor.get_active_requests())
+ if active_count > 0:
+ logger.warning(f"Killing browser {target_sig[:8]} while {active_count} requests are active - may cause failures")
+
+ # Kill the browser
+ if pool_type == "hot":
+ browser = HOT_POOL.pop(target_sig)
+ else:
+ browser = COLD_POOL.pop(target_sig)
+
+ with suppress(Exception):
+ await browser.close()
+
+ LAST_USED.pop(target_sig, None)
+ USAGE_COUNT.pop(target_sig, None)
+
+ logger.info(f"🔪 Killed {pool_type} browser (sig={target_sig[:8]})")
+
+ monitor = get_monitor()
+ await monitor.track_janitor_event("kill_browser", target_sig, {"pool": pool_type, "manual": True})
+
+ return {"success": True, "killed_sig": target_sig[:8], "pool_type": pool_type}
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Error killing browser: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.post("/actions/restart_browser")
+async def restart_browser(req: KillBrowserRequest):
+ """Restart a browser (kill + recreate). Works for permanent too.
+
+ Args:
+ sig: Browser config signature (first 8 chars), or "permanent"
+ """
+ try:
+ from crawler_pool import (PERMANENT, HOT_POOL, COLD_POOL, LAST_USED,
+ USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG, init_permanent)
+        from crawl4ai import BrowserConfig
+        from contextlib import suppress
+
+ # Handle permanent browser restart
+ if req.sig == "permanent" or (DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig)):
+ async with LOCK:
+ if PERMANENT:
+ with suppress(Exception):
+ await PERMANENT.close()
+
+ # Reinitialize permanent
+ from utils import load_config
+ config = load_config()
+ await init_permanent(BrowserConfig(
+ extra_args=config["crawler"]["browser"].get("extra_args", []),
+ **config["crawler"]["browser"].get("kwargs", {}),
+ ))
+
+ logger.info("🔄 Restarted permanent browser")
+ return {"success": True, "restarted": "permanent"}
+
+ # Handle hot/cold browser restart
+ target_sig = None
+ pool_type = None
+ browser_config = None
+
+ async with LOCK:
+ # Find browser
+ for sig in HOT_POOL.keys():
+ if sig.startswith(req.sig):
+ target_sig = sig
+ pool_type = "hot"
+ # Would need to reconstruct config (not stored currently)
+ break
+
+ if not target_sig:
+ for sig in COLD_POOL.keys():
+ if sig.startswith(req.sig):
+ target_sig = sig
+ pool_type = "cold"
+ break
+
+ if not target_sig:
+ raise HTTPException(404, f"Browser with sig={req.sig} not found")
+
+ # Kill existing
+ if pool_type == "hot":
+ browser = HOT_POOL.pop(target_sig)
+ else:
+ browser = COLD_POOL.pop(target_sig)
+
+ with suppress(Exception):
+ await browser.close()
+
+ # Note: We can't easily recreate with same config without storing it
+ # For now, just kill and let new requests create fresh ones
+ LAST_USED.pop(target_sig, None)
+ USAGE_COUNT.pop(target_sig, None)
+
+ logger.info(f"🔄 Restarted {pool_type} browser (sig={target_sig[:8]})")
+
+ monitor = get_monitor()
+ await monitor.track_janitor_event("restart_browser", target_sig, {"pool": pool_type})
+
+ return {"success": True, "restarted_sig": target_sig[:8], "note": "Browser will be recreated on next request"}
+ except HTTPException:
+ raise
+ except Exception as e:
+ logger.error(f"Error restarting browser: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.post("/stats/reset")
+async def reset_stats():
+ """Reset today's endpoint counters."""
+ try:
+ monitor = get_monitor()
+ monitor.endpoint_stats.clear()
+ await monitor._persist_endpoint_stats()
+
+ return {"success": True, "message": "Endpoint stats reset"}
+ except Exception as e:
+ logger.error(f"Error resetting stats: {e}")
+ raise HTTPException(500, str(e))
+
+
+@router.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+ """WebSocket endpoint for real-time monitoring updates.
+
+ Sends updates every 2 seconds with:
+ - Health stats
+ - Active/completed requests
+ - Browser pool status
+ - Timeline data
+ """
+ await websocket.accept()
+ logger.info("WebSocket client connected")
+
+ try:
+ while True:
+ try:
+ # Gather all monitoring data
+ monitor = get_monitor()
+
+ data = {
+ "timestamp": asyncio.get_event_loop().time(),
+ "health": await monitor.get_health_summary(),
+ "requests": {
+ "active": monitor.get_active_requests(),
+ "completed": monitor.get_completed_requests(limit=10)
+ },
+ "browsers": await monitor.get_browser_list(),
+ "timeline": {
+ "memory": monitor.get_timeline_data("memory", "5m"),
+ "requests": monitor.get_timeline_data("requests", "5m"),
+ "browsers": monitor.get_timeline_data("browsers", "5m")
+ },
+ "janitor": monitor.get_janitor_log(limit=10),
+ "errors": monitor.get_errors_log(limit=10)
+ }
+
+ # Send update to client
+ await websocket.send_json(data)
+
+ # Wait 2 seconds before next update
+ await asyncio.sleep(2)
+
+ except WebSocketDisconnect:
+ logger.info("WebSocket client disconnected")
+ break
+ except Exception as e:
+ logger.error(f"WebSocket error: {e}", exc_info=True)
+ await asyncio.sleep(2) # Continue trying
+
+ except Exception as e:
+ logger.error(f"WebSocket connection error: {e}", exc_info=True)
+ finally:
+ logger.info("WebSocket connection closed")
diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt
index d463c641..b33c081f 100644
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -12,6 +12,6 @@ pydantic>=2.11
rank-bm25==0.2.2
anyio==4.9.0
PyJWT==2.10.1
-mcp>=1.6.0
+mcp>=1.18.0
websockets>=15.0.1
httpx[http2]>=0.27.2
diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py
index 96196633..21d47fc4 100644
--- a/deploy/docker/schemas.py
+++ b/deploy/docker/schemas.py
@@ -1,6 +1,6 @@
from typing import List, Optional, Dict
from enum import Enum
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
from utils import FilterType
@@ -9,6 +9,50 @@ class CrawlRequest(BaseModel):
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
+
+class HookConfig(BaseModel):
+ """Configuration for user-provided hooks"""
+ code: Dict[str, str] = Field(
+ default_factory=dict,
+ description="Map of hook points to Python code strings"
+ )
+ timeout: int = Field(
+ default=30,
+ ge=1,
+ le=120,
+ description="Timeout in seconds for each hook execution"
+ )
+
+    class Config:
+        json_schema_extra = {
+ "example": {
+ "code": {
+ "on_page_context_created": """
+async def hook(page, context, **kwargs):
+ # Block images to speed up crawling
+ await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
+ return page
+""",
+ "before_retrieve_html": """
+async def hook(page, context, **kwargs):
+ # Scroll to load lazy content
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ await page.wait_for_timeout(2000)
+ return page
+"""
+ },
+ "timeout": 30
+ }
+ }
+
+
+class CrawlRequestWithHooks(CrawlRequest):
+ """Extended crawl request with hooks support"""
+ hooks: Optional[HookConfig] = Field(
+ default=None,
+ description="Optional user-provided hook functions"
+ )
+
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
@@ -16,6 +60,8 @@ class MarkdownRequest(BaseModel):
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
+ temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
+ base_url: Optional[str] = Field(None, description="LLM API base URL override")
class RawCode(BaseModel):
@@ -39,4 +85,22 @@ class JSEndpointRequest(BaseModel):
scripts: List[str] = Field(
...,
description="List of separated JavaScript snippets to execute"
- )
\ No newline at end of file
+ )
+
+
+class WebhookConfig(BaseModel):
+ """Configuration for webhook notifications."""
+ webhook_url: HttpUrl
+ webhook_data_in_payload: bool = False
+ webhook_headers: Optional[Dict[str, str]] = None
+
+
+class WebhookPayload(BaseModel):
+ """Payload sent to webhook endpoints."""
+ task_id: str
+ task_type: str # "crawl", "llm_extraction", etc.
+ status: str # "completed" or "failed"
+ timestamp: str # ISO 8601 format
+ urls: List[str]
+ error: Optional[str] = None
+ data: Optional[Dict] = None # Included only if webhook_data_in_payload=True
\ No newline at end of file
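Since these are ordinary pydantic v2 models, clients can validate a request locally before posting it. A small sketch, assuming `CrawlRequest.urls` accepts plain URL strings:

```python
from schemas import CrawlRequestWithHooks, HookConfig, WebhookConfig

req = CrawlRequestWithHooks(
    urls=["https://example.com"],
    hooks=HookConfig(
        code={
            "before_retrieve_html": (
                "async def hook(page, context, **kwargs):\n"
                "    await page.wait_for_timeout(1000)\n"
                "    return page\n"
            )
        },
        timeout=30,  # clamped to 1..120 by the field constraints
    ),
)
print(req.model_dump())

wh = WebhookConfig(webhook_url="https://my-service.example/notify")
print(wh.model_dump(mode="json"))  # HttpUrl is serialized back to a plain string
```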
diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index 57fd3d6d..62e4e441 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -16,6 +16,7 @@ from fastapi import Request, Depends
from fastapi.responses import FileResponse
import base64
import re
+import logging
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from api import (
handle_markdown_request, handle_llm_qa,
@@ -23,7 +24,7 @@ from api import (
stream_results
)
from schemas import (
- CrawlRequest,
+ CrawlRequestWithHooks,
MarkdownRequest,
RawCode,
HTMLRequest,
@@ -78,6 +79,14 @@ __version__ = "0.5.1-d1"
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
+# ── default browser config helper ─────────────────────────────
+def get_default_browser_config() -> BrowserConfig:
+ """Get default BrowserConfig from config.yml."""
+ return BrowserConfig(
+ extra_args=config["crawler"]["browser"].get("extra_args", []),
+ **config["crawler"]["browser"].get("kwargs", {}),
+ )
+
# import logging
# page_log = logging.getLogger("page_cap")
# orig_arun = AsyncWebCrawler.arun
@@ -103,15 +112,52 @@ AsyncWebCrawler.arun = capped_arun
@asynccontextmanager
async def lifespan(_: FastAPI):
- await get_crawler(BrowserConfig(
+ from crawler_pool import init_permanent
+ from monitor import MonitorStats
+ import monitor as monitor_module
+
+ # Initialize monitor
+ monitor_module.monitor_stats = MonitorStats(redis)
+ await monitor_module.monitor_stats.load_from_redis()
+ monitor_module.monitor_stats.start_persistence_worker()
+
+ # Initialize browser pool
+ await init_permanent(BrowserConfig(
extra_args=config["crawler"]["browser"].get("extra_args", []),
**config["crawler"]["browser"].get("kwargs", {}),
- )) # warm‑up
- app.state.janitor = asyncio.create_task(janitor()) # idle GC
+ ))
+
+ # Start background tasks
+ app.state.janitor = asyncio.create_task(janitor())
+ app.state.timeline_updater = asyncio.create_task(_timeline_updater())
+
yield
+
+ # Cleanup
app.state.janitor.cancel()
+ app.state.timeline_updater.cancel()
+
+ # Monitor cleanup (persist stats and stop workers)
+ from monitor import get_monitor
+ try:
+ await get_monitor().cleanup()
+ except Exception as e:
+ logger.error(f"Monitor cleanup failed: {e}")
+
await close_all()
+async def _timeline_updater():
+ """Update timeline data every 5 seconds."""
+ from monitor import get_monitor
+ while True:
+ await asyncio.sleep(5)
+ try:
+ await asyncio.wait_for(get_monitor().update_timeline(), timeout=4.0)
+ except asyncio.TimeoutError:
+ logger.warning("Timeline update timeout after 4s")
+ except Exception as e:
+ logger.warning(f"Timeline update error: {e}")
+
# ───────────────────── FastAPI instance ──────────────────────
app = FastAPI(
title=config["app"]["title"],
@@ -129,6 +175,25 @@ app.mount(
name="play",
)
+# ── static monitor dashboard ────────────────────────────────
+MONITOR_DIR = pathlib.Path(__file__).parent / "static" / "monitor"
+if not MONITOR_DIR.exists():
+ raise RuntimeError(f"Monitor assets not found at {MONITOR_DIR}")
+app.mount(
+ "/dashboard",
+ StaticFiles(directory=MONITOR_DIR, html=True),
+ name="monitor_ui",
+)
+
+# ── static assets (logo, etc) ────────────────────────────────
+ASSETS_DIR = pathlib.Path(__file__).parent / "static" / "assets"
+if ASSETS_DIR.exists():
+ app.mount(
+ "/static/assets",
+ StaticFiles(directory=ASSETS_DIR),
+ name="assets",
+ )
+
@app.get("/")
async def root():
@@ -212,6 +277,12 @@ def _safe_eval_config(expr: str) -> dict:
# ── job router ──────────────────────────────────────────────
app.include_router(init_job_router(redis, config, token_dep))
+# ── monitor router ──────────────────────────────────────────
+from monitor_routes import router as monitor_router
+app.include_router(monitor_router)
+
+logger = logging.getLogger(__name__)
+
# ──────────────────────── Endpoints ──────────────────────────
@app.post("/token")
async def get_token(req: TokenRequest):
@@ -241,7 +312,8 @@ async def get_markdown(
raise HTTPException(
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
markdown = await handle_markdown_request(
- body.url, body.f, body.q, body.c, config, body.provider
+ body.url, body.f, body.q, body.c, config, body.provider,
+ body.temperature, body.base_url
)
return JSONResponse({
"url": body.url,
@@ -265,13 +337,20 @@ async def generate_html(
Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
Use when you need sanitized HTML structures for building schemas or further processing.
"""
+ from crawler_pool import get_crawler
cfg = CrawlerRunConfig()
- async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+ try:
+ crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
- raw_html = results[0].html
- from crawl4ai.utils import preprocess_html_for_schema
- processed_html = preprocess_html_for_schema(raw_html)
- return JSONResponse({"html": processed_html, "url": body.url, "success": True})
+ if not results[0].success:
+ raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
+
+ raw_html = results[0].html
+ from crawl4ai.utils import preprocess_html_for_schema
+ processed_html = preprocess_html_for_schema(raw_html)
+ return JSONResponse({"html": processed_html, "url": body.url, "success": True})
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, detail=str(e))
# Screenshot endpoint
@@ -289,18 +368,23 @@ async def generate_screenshot(
    Use when you need an image snapshot of the rendered page. It is recommended to provide an output path to save the screenshot.
    The result will then contain the path to the saved file instead of the screenshot data.
"""
- cfg = CrawlerRunConfig(
- screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
- async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+ from crawler_pool import get_crawler
+ try:
+ cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
+ crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
- screenshot_data = results[0].screenshot
- if body.output_path:
- abs_path = os.path.abspath(body.output_path)
- os.makedirs(os.path.dirname(abs_path), exist_ok=True)
- with open(abs_path, "wb") as f:
- f.write(base64.b64decode(screenshot_data))
- return {"success": True, "path": abs_path}
- return {"success": True, "screenshot": screenshot_data}
+ if not results[0].success:
+ raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
+ screenshot_data = results[0].screenshot
+ if body.output_path:
+ abs_path = os.path.abspath(body.output_path)
+ os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+ with open(abs_path, "wb") as f:
+ f.write(base64.b64decode(screenshot_data))
+ return {"success": True, "path": abs_path}
+ return {"success": True, "screenshot": screenshot_data}
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, detail=str(e))
# PDF endpoint
@@ -318,17 +402,23 @@ async def generate_pdf(
    Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
    The result will then contain the path to the saved file instead of the PDF data.
"""
- cfg = CrawlerRunConfig(pdf=True)
- async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+ from crawler_pool import get_crawler
+ try:
+ cfg = CrawlerRunConfig(pdf=True)
+ crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
- pdf_data = results[0].pdf
- if body.output_path:
- abs_path = os.path.abspath(body.output_path)
- os.makedirs(os.path.dirname(abs_path), exist_ok=True)
- with open(abs_path, "wb") as f:
- f.write(pdf_data)
- return {"success": True, "path": abs_path}
- return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
+ if not results[0].success:
+ raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
+ pdf_data = results[0].pdf
+ if body.output_path:
+ abs_path = os.path.abspath(body.output_path)
+ os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+ with open(abs_path, "wb") as f:
+ f.write(pdf_data)
+ return {"success": True, "path": abs_path}
+ return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, detail=str(e))
@app.post("/execute_js")
@@ -384,12 +474,17 @@ async def execute_js(
```
"""
- cfg = CrawlerRunConfig(js_code=body.scripts)
- async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+ from crawler_pool import get_crawler
+ try:
+ cfg = CrawlerRunConfig(js_code=body.scripts)
+ crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
- # Return JSON-serializable dict of the first CrawlResult
- data = results[0].model_dump()
- return JSONResponse(data)
+ if not results[0].success:
+ raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
+ data = results[0].model_dump()
+ return JSONResponse(data)
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, detail=str(e))
@app.get("/llm/{url:path}")
@@ -414,6 +509,72 @@ async def get_schema():
"crawler": CrawlerRunConfig().dump()}
+@app.get("/hooks/info")
+async def get_hooks_info():
+ """Get information about available hook points and their signatures"""
+ from hook_manager import UserHookManager
+
+ hook_info = {}
+ for hook_point, params in UserHookManager.HOOK_SIGNATURES.items():
+ hook_info[hook_point] = {
+ "parameters": params,
+ "description": get_hook_description(hook_point),
+ "example": get_hook_example(hook_point)
+ }
+
+ return JSONResponse({
+ "available_hooks": hook_info,
+ "timeout_limits": {
+ "min": 1,
+ "max": 120,
+ "default": 30
+ }
+ })
+
+
+def get_hook_description(hook_point: str) -> str:
+ """Get description for each hook point"""
+ descriptions = {
+ "on_browser_created": "Called after browser instance is created",
+ "on_page_context_created": "Called after page and context are created - ideal for authentication",
+ "before_goto": "Called before navigating to the target URL",
+ "after_goto": "Called after navigation is complete",
+ "on_user_agent_updated": "Called when user agent is updated",
+ "on_execution_started": "Called when custom JavaScript execution begins",
+ "before_retrieve_html": "Called before retrieving the final HTML - ideal for scrolling",
+ "before_return_html": "Called just before returning the HTML content"
+ }
+ return descriptions.get(hook_point, "")
+
+
+def get_hook_example(hook_point: str) -> str:
+ """Get example code for each hook point"""
+ examples = {
+ "on_page_context_created": """async def hook(page, context, **kwargs):
+ # Add authentication cookie
+ await context.add_cookies([{
+ 'name': 'session',
+ 'value': 'my-session-id',
+ 'domain': '.example.com'
+ }])
+ return page""",
+
+ "before_retrieve_html": """async def hook(page, context, **kwargs):
+ # Scroll to load lazy content
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ await page.wait_for_timeout(2000)
+ return page""",
+
+ "before_goto": """async def hook(page, context, url, **kwargs):
+ # Set custom headers
+ await page.set_extra_http_headers({
+ 'X-Custom-Header': 'value'
+ })
+ return page"""
+ }
+ return examples.get(hook_point, "# Implement your hook logic here\nreturn page")
+
+
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
return {"status": "ok", "timestamp": time.time(), "version": __version__}
@@ -429,46 +590,86 @@ async def metrics():
@mcp_tool("crawl")
async def crawl(
request: Request,
- crawl_request: CrawlRequest,
+ crawl_request: CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
"""
Crawl a list of URLs and return the results as JSON.
+    For streaming responses, use the /crawl/stream endpoint.
+ Supports optional user-provided hook functions for customization.
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
- res = await handle_crawl_request(
+    # Redirect to the streaming handler when the crawler config requests streaming
+ crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+ if crawler_config.stream:
+ return await stream_process(crawl_request=crawl_request)
+
+ # Prepare hooks config if provided
+ hooks_config = None
+ if crawl_request.hooks:
+ hooks_config = {
+ 'code': crawl_request.hooks.code,
+ 'timeout': crawl_request.hooks.timeout
+ }
+
+ results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
+ hooks_config=hooks_config
)
- return JSONResponse(res)
+    # If every result failed, surface the first error
+ if all(not result["success"] for result in results["results"]):
+ raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
+ return JSONResponse(results)
@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
request: Request,
- crawl_request: CrawlRequest,
+ crawl_request: CrawlRequestWithHooks,
_td: Dict = Depends(token_dep),
):
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")
- crawler, gen = await handle_stream_crawl_request(
+
+ return await stream_process(crawl_request=crawl_request)
+
+async def stream_process(crawl_request: CrawlRequestWithHooks):
+
+    # Prepare hooks config if provided
+ hooks_config = None
+ if crawl_request.hooks:
+ hooks_config = {
+ 'code': crawl_request.hooks.code,
+ 'timeout': crawl_request.hooks.timeout
+ }
+
+ crawler, gen, hooks_info = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config,
+ hooks_config=hooks_config
)
+
+ # Add hooks info to response headers if available
+ headers = {
+ "Cache-Control": "no-cache",
+ "Connection": "keep-alive",
+ "X-Stream-Status": "active",
+ }
+    if hooks_info:
+        headers["X-Hooks-Status"] = hooks_info['status']['status']
+
return StreamingResponse(
stream_results(crawler, gen),
media_type="application/x-ndjson",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "X-Stream-Status": "active",
- },
+ headers=headers,
)
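End to end, the extended request shape looks like this over HTTP; the port and token-less auth are assumptions for a local deployment:

```python
import httpx

payload = {
    "urls": ["https://example.com"],
    "browser_config": {},
    "crawler_config": {},
    "hooks": {
        "code": {
            # Block PNGs to speed up the crawl; same pattern as the schema example
            "on_page_context_created": (
                "async def hook(page, context, **kwargs):\n"
                "    await context.route('**/*.png', lambda route: route.abort())\n"
                "    return page\n"
            )
        },
        "timeout": 30,
    },
}

resp = httpx.post("http://localhost:11235/crawl", json=payload, timeout=120)
resp.raise_for_status()
body = resp.json()
print(body["results"][0]["success"])
```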
diff --git a/deploy/docker/static/assets/crawl4ai-logo.jpg b/deploy/docker/static/assets/crawl4ai-logo.jpg
new file mode 100644
index 00000000..6a808c04
Binary files /dev/null and b/deploy/docker/static/assets/crawl4ai-logo.jpg differ
diff --git a/deploy/docker/static/assets/crawl4ai-logo.png b/deploy/docker/static/assets/crawl4ai-logo.png
new file mode 100644
index 00000000..ed82a3cc
Binary files /dev/null and b/deploy/docker/static/assets/crawl4ai-logo.png differ
diff --git a/deploy/docker/static/assets/logo.png b/deploy/docker/static/assets/logo.png
new file mode 100644
index 00000000..25911853
Binary files /dev/null and b/deploy/docker/static/assets/logo.png differ
diff --git a/deploy/docker/static/monitor/index.html b/deploy/docker/static/monitor/index.html
new file mode 100644
index 00000000..a9f8ed39
--- /dev/null
+++ b/deploy/docker/static/monitor/index.html
@@ -0,0 +1,1070 @@
+    <title>Crawl4AI Monitor</title>