Merge branch 'next' into 2025-MAR-ALPHA-1

This commit is contained in:
Aravind Karnam
2025-04-17 10:50:02 +05:30
38 changed files with 5574 additions and 878 deletions

View File

@@ -24,7 +24,7 @@ ARG TARGETARCH
LABEL maintainer="unclecode" LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0" LABEL version="1.0"
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \ build-essential \
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg-dev \ libjpeg-dev \
redis-server \ redis-server \
supervisor \ supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \ libcairo2 \
libasound2 \ libasound2 \
libatspi2.0-0 \ libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \ nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \ && rm -rf /var/lib/apt/lists/* ; \
else \ else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \ echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \ libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \ elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \ echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \ libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
else \ else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \ echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME} WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\ RUN echo '#!/bin/bash\n\
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/ COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf . COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt . COPY deploy/docker/requirements.txt .
@@ -131,16 +143,31 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
else \ else \
pip install "/tmp/project" ; \ pip install "/tmp/project" ; \
fi fi
RUN pip install --no-cache-dir --upgrade pip && \ RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \ /tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
RUN crawl4ai-setup
RUN playwright install --with-deps
RUN mkdir -p /home/appuser/.cache/ms-playwright \
&& cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
&& chown -R appuser:appuser /home/appuser/.cache/ms-playwright
RUN crawl4ai-doctor
# Copy application code
COPY deploy/docker/* ${APP_HOME}/ COPY deploy/docker/* ${APP_HOME}/
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\ CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
@@ -149,8 +176,14 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
exit 1; \ exit 1; \
fi && \ fi && \
redis-cli ping > /dev/null && \ redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1' curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379 EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"] # Switch to the non-root user before starting the application
USER appuser
# Set environment variables to production
ENV PYTHON_ENV=production
# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -2,7 +2,7 @@
import warnings import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
from .content_scraping_strategy import ( from .content_scraping_strategy import (
ContentScrapingStrategy, ContentScrapingStrategy,
@@ -121,6 +121,7 @@ __all__ = [
"Crawl4aiDockerClient", "Crawl4aiDockerClient",
"ProxyRotationStrategy", "ProxyRotationStrategy",
"RoundRobinProxyStrategy", "RoundRobinProxyStrategy",
"ProxyConfig"
] ]

View File

@@ -5,6 +5,7 @@ from .config import (
MIN_WORD_THRESHOLD, MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
PROVIDER_MODELS, PROVIDER_MODELS,
PROVIDER_MODELS_PREFIXES,
SCREENSHOT_HEIGHT_TRESHOLD, SCREENSHOT_HEIGHT_TRESHOLD,
PAGE_TIMEOUT, PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD, IMAGE_SCORE_THRESHOLD,
@@ -27,11 +28,8 @@ import inspect
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from enum import Enum from enum import Enum
from .proxy_strategy import ProxyConfig # from .proxy_strategy import ProxyConfig
try:
from .browser.models import DockerConfig
except ImportError:
DockerConfig = None
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
@@ -122,23 +120,25 @@ def from_serializable_dict(data: Any) -> Any:
# Handle typed data # Handle typed data
if isinstance(data, dict) and "type" in data: if isinstance(data, dict) and "type" in data:
# Handle plain dictionaries # Handle plain dictionaries
if data["type"] == "dict": if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()} return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances # Import from crawl4ai for class instances
import crawl4ai import crawl4ai
cls = getattr(crawl4ai, data["type"]) if hasattr(crawl4ai, data["type"]):
cls = getattr(crawl4ai, data["type"])
# Handle Enum # Handle Enum
if issubclass(cls, Enum): if issubclass(cls, Enum):
return cls(data["params"]) return cls(data["params"])
# Handle class instances if "params" in data:
constructor_args = { # Handle class instances
k: from_serializable_dict(v) for k, v in data["params"].items() constructor_args = {
} k: from_serializable_dict(v) for k, v in data["params"].items()
return cls(**constructor_args) }
return cls(**constructor_args)
# Handle lists # Handle lists
if isinstance(data, list): if isinstance(data, list):
@@ -159,6 +159,117 @@ def is_empty_value(value: Any) -> bool:
return True return True
return False return False
class ProxyConfig:
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
"""Configuration class for a single proxy.
Args:
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
username: Optional username for proxy authentication
password: Optional password for proxy authentication
ip: Optional IP address for verification purposes
"""
self.server = server
self.username = username
self.password = password
# Extract IP from server if not explicitly provided
self.ip = ip or self._extract_ip_from_server()
def _extract_ip_from_server(self) -> Optional[str]:
"""Extract IP address from server URL."""
try:
# Simple extraction assuming http://ip:port format
if "://" in self.server:
parts = self.server.split("://")[1].split(":")
return parts[0]
else:
parts = self.server.split(":")
return parts[0]
except Exception:
return None
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
parts = proxy_str.split(":")
if len(parts) == 4: # ip:port:username:password
ip, port, username, password = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
username=username,
password=password,
ip=ip
)
elif len(parts) == 2: # ip:port only
ip, port = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
ip=ip
)
else:
raise ValueError(f"Invalid proxy string format: {proxy_str}")
@staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
"""Create a ProxyConfig from a dictionary."""
return ProxyConfig(
server=proxy_dict.get("server"),
username=proxy_dict.get("username"),
password=proxy_dict.get("password"),
ip=proxy_dict.get("ip")
)
@staticmethod
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
"""Load proxies from environment variable.
Args:
env_var: Name of environment variable containing comma-separated proxy strings
Returns:
List of ProxyConfig objects
"""
proxies = []
try:
proxy_list = os.getenv(env_var, "").split(",")
for proxy in proxy_list:
if not proxy:
continue
proxies.append(ProxyConfig.from_string(proxy))
except Exception as e:
print(f"Error loading proxies from environment: {e}")
return proxies
def to_dict(self) -> Dict:
"""Convert to dictionary representation."""
return {
"server": self.server,
"username": self.username,
"password": self.password,
"ip": self.ip
}
def clone(self, **kwargs) -> "ProxyConfig":
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
ProxyConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return ProxyConfig.from_dict(config_dict)
class BrowserConfig: class BrowserConfig:
""" """
@@ -195,8 +306,6 @@ class BrowserConfig:
Default: None. Default: None.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None. If None, no additional proxy config. Default: None.
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
Contains settings for Docker container operation. Default: None.
viewport_width (int): Default viewport width for pages. Default: 1080. viewport_width (int): Default viewport width for pages. Default: 1080.
viewport_height (int): Default viewport height for pages. Default: 600. viewport_height (int): Default viewport height for pages. Default: 600.
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
@@ -242,7 +351,6 @@ class BrowserConfig:
channel: str = "chromium", channel: str = "chromium",
proxy: str = None, proxy: str = None,
proxy_config: Union[ProxyConfig, dict, None] = None, proxy_config: Union[ProxyConfig, dict, None] = None,
docker_config: Union[DockerConfig, dict, None] = None,
viewport_width: int = 1080, viewport_width: int = 1080,
viewport_height: int = 600, viewport_height: int = 600,
viewport: dict = None, viewport: dict = None,
@@ -283,15 +391,7 @@ class BrowserConfig:
self.chrome_channel = "" self.chrome_channel = ""
self.proxy = proxy self.proxy = proxy
self.proxy_config = proxy_config self.proxy_config = proxy_config
# Handle docker configuration
if isinstance(docker_config, dict) and DockerConfig is not None:
self.docker_config = DockerConfig.from_kwargs(docker_config)
else:
self.docker_config = docker_config
if self.docker_config:
self.user_data_dir = self.docker_config.user_data_dir
self.viewport_width = viewport_width self.viewport_width = viewport_width
self.viewport_height = viewport_height self.viewport_height = viewport_height
@@ -362,7 +462,6 @@ class BrowserConfig:
channel=kwargs.get("channel", "chromium"), channel=kwargs.get("channel", "chromium"),
proxy=kwargs.get("proxy"), proxy=kwargs.get("proxy"),
proxy_config=kwargs.get("proxy_config", None), proxy_config=kwargs.get("proxy_config", None),
docker_config=kwargs.get("docker_config", None),
viewport_width=kwargs.get("viewport_width", 1080), viewport_width=kwargs.get("viewport_width", 1080),
viewport_height=kwargs.get("viewport_height", 600), viewport_height=kwargs.get("viewport_height", 600),
accept_downloads=kwargs.get("accept_downloads", False), accept_downloads=kwargs.get("accept_downloads", False),
@@ -419,13 +518,7 @@ class BrowserConfig:
"debugging_port": self.debugging_port, "debugging_port": self.debugging_port,
"host": self.host, "host": self.host,
} }
# Include docker_config if it exists
if hasattr(self, "docker_config") and self.docker_config is not None:
if hasattr(self.docker_config, "to_dict"):
result["docker_config"] = self.docker_config.to_dict()
else:
result["docker_config"] = self.docker_config
return result return result
@@ -1178,9 +1271,18 @@ class LLMConfig:
elif api_token and api_token.startswith("env:"): elif api_token and api_token.startswith("env:"):
self.api_token = os.getenv(api_token[4:]) self.api_token = os.getenv(api_token[4:])
else: else:
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv( # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
DEFAULT_PROVIDER_API_KEY # If not, check if it is in PROVIDER_MODELS
) prefixes = PROVIDER_MODELS_PREFIXES.keys()
if any(provider.startswith(prefix) for prefix in prefixes):
selected_prefix = next(
(prefix for prefix in prefixes if provider.startswith(prefix)),
None,
)
self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix)
else:
self.provider = DEFAULT_PROVIDER
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
self.base_url = base_url self.base_url = base_url
self.temprature = temprature self.temprature = temprature
self.max_tokens = max_tokens self.max_tokens = max_tokens

View File

@@ -36,7 +36,7 @@ from .markdown_generation_strategy import (
) )
from .deep_crawling import DeepCrawlDecorator from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger, AsyncLoggerBase from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
from .async_dispatcher import * # noqa: F403 from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -291,12 +291,12 @@ class AsyncWebCrawler:
# Update proxy configuration from rotation strategy if available # Update proxy configuration from rotation strategy if available
if config and config.proxy_rotation_strategy: if config and config.proxy_rotation_strategy:
next_proxy = await config.proxy_rotation_strategy.get_next_proxy() next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
if next_proxy: if next_proxy:
self.logger.info( self.logger.info(
message="Switch proxy: {proxy}", message="Switch proxy: {proxy}",
tag="PROXY", tag="PROXY",
params={"proxy": next_proxy.server}, params={"proxy": next_proxy.server}
) )
config.proxy_config = next_proxy config.proxy_config = next_proxy
# config = config.clone(proxy_config=next_proxy) # config = config.clone(proxy_config=next_proxy)

View File

@@ -94,6 +94,7 @@ class ManagedBrowser:
host: str = "localhost", host: str = "localhost",
debugging_port: int = 9222, debugging_port: int = 9222,
cdp_url: Optional[str] = None, cdp_url: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None,
): ):
""" """
Initialize the ManagedBrowser instance. Initialize the ManagedBrowser instance.
@@ -109,17 +110,19 @@ class ManagedBrowser:
host (str): Host for debugging the browser. Default: "localhost". host (str): Host for debugging the browser. Default: "localhost".
debugging_port (int): Port for debugging the browser. Default: 9222. debugging_port (int): Port for debugging the browser. Default: 9222.
cdp_url (str or None): CDP URL to connect to the browser. Default: None. cdp_url (str or None): CDP URL to connect to the browser. Default: None.
browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
""" """
self.browser_type = browser_type self.browser_type = browser_config.browser_type
self.user_data_dir = user_data_dir self.user_data_dir = browser_config.user_data_dir
self.headless = headless self.headless = browser_config.headless
self.browser_process = None self.browser_process = None
self.temp_dir = None self.temp_dir = None
self.debugging_port = debugging_port self.debugging_port = browser_config.debugging_port
self.host = host self.host = browser_config.host
self.logger = logger self.logger = logger
self.shutting_down = False self.shutting_down = False
self.cdp_url = cdp_url self.cdp_url = browser_config.cdp_url
self.browser_config = browser_config
async def start(self) -> str: async def start(self) -> str:
""" """
@@ -142,6 +145,9 @@ class ManagedBrowser:
# Get browser path and args based on OS and browser type # Get browser path and args based on OS and browser type
# browser_path = self._get_browser_path() # browser_path = self._get_browser_path()
args = await self._get_browser_args() args = await self._get_browser_args()
if self.browser_config.extra_args:
args.extend(self.browser_config.extra_args)
# Start browser process # Start browser process
try: try:
@@ -477,6 +483,7 @@ class BrowserManager:
logger=self.logger, logger=self.logger,
debugging_port=self.config.debugging_port, debugging_port=self.config.debugging_port,
cdp_url=self.config.cdp_url, cdp_url=self.config.cdp_url,
browser_config=self.config,
) )
async def start(self): async def start(self):
@@ -491,10 +498,12 @@ class BrowserManager:
Note: This method should be called in a separate task to avoid blocking the main event loop. Note: This method should be called in a separate task to avoid blocking the main event loop.
""" """
if self.playwright is None: if self.playwright is not None:
from playwright.async_api import async_playwright await self.close()
from playwright.async_api import async_playwright
self.playwright = await async_playwright().start() self.playwright = await async_playwright().start()
if self.config.cdp_url or self.config.use_managed_browser: if self.config.cdp_url or self.config.use_managed_browser:
self.config.use_managed_browser = True self.config.use_managed_browser = True

View File

@@ -29,6 +29,14 @@ PROVIDER_MODELS = {
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"), 'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"), "deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
} }
PROVIDER_MODELS_PREFIXES = {
"ollama": "no-token-needed", # Any model from Ollama no need for API token
"groq": os.getenv("GROQ_API_KEY"),
"openai": os.getenv("OPENAI_API_KEY"),
"anthropic": os.getenv("ANTHROPIC_API_KEY"),
"gemini": os.getenv("GEMINI_API_KEY"),
"deepseek": os.getenv("DEEPSEEK_API_KEY"),
}
# Chunk token threshold # Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens

View File

@@ -7,7 +7,9 @@ import time
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import ( from .config import (
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, DEFAULT_PROVIDER,
DEFAULT_PROVIDER_API_KEY,
CHUNK_TOKEN_THRESHOLD,
OVERLAP_RATE, OVERLAP_RATE,
WORD_TOKEN_RATE, WORD_TOKEN_RATE,
) )
@@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
""" """
super().__init__( input_format=input_format, **kwargs) super().__init__( input_format=input_format, **kwargs)
self.llm_config = llm_config self.llm_config = llm_config
if not self.llm_config:
self.llm_config = create_llm_config(
provider=DEFAULT_PROVIDER,
api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
)
self.instruction = instruction self.instruction = instruction
self.extract_type = extraction_type self.extract_type = extraction_type
self.schema = schema self.schema = schema

View File

@@ -40,10 +40,25 @@ def setup_home_directory():
f.write("") f.write("")
def post_install(): def post_install():
"""Run all post-installation tasks""" """
Run all post-installation tasks.
Checks CRAWL4AI_MODE environment variable. If set to 'api',
skips Playwright browser installation.
"""
logger.info("Running post-installation setup...", tag="INIT") logger.info("Running post-installation setup...", tag="INIT")
setup_home_directory() setup_home_directory()
install_playwright()
# Check environment variable to conditionally skip Playwright install
run_mode = os.getenv('CRAWL4AI_MODE')
if run_mode == 'api':
logger.warning(
"CRAWL4AI_MODE=api detected. Skipping Playwright browser installation.",
tag="SETUP"
)
else:
# Proceed with installation only if mode is not 'api'
install_playwright()
run_migration() run_migration()
# TODO: Will be added in the future # TODO: Will be added in the future
# setup_builtin_browser() # setup_builtin_browser()

View File

@@ -4,6 +4,9 @@ from itertools import cycle
import os import os
########### ATTENTION PEOPLE OF EARTH ###########
# I have moved this config to async_configs.py, kept it here, in case someone still importing it, however
# be a dear and follow `from crawl4ai import ProxyConfig` instead :)
class ProxyConfig: class ProxyConfig:
def __init__( def __init__(
self, self,
@@ -119,12 +122,12 @@ class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies""" """Base abstract class for proxy rotation strategies"""
@abstractmethod @abstractmethod
async def get_next_proxy(self) -> Optional[Dict]: async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get next proxy configuration from the strategy""" """Get next proxy configuration from the strategy"""
pass pass
@abstractmethod @abstractmethod
def add_proxies(self, proxies: List[Dict]): def add_proxies(self, proxies: List[ProxyConfig]):
"""Add proxy configurations to the strategy""" """Add proxy configurations to the strategy"""
pass pass

View File

@@ -9,83 +9,44 @@ from urllib.parse import urlparse
import OpenSSL.crypto import OpenSSL.crypto
from pathlib import Path from pathlib import Path
# === Inherit from dict ===
class SSLCertificate: class SSLCertificate(dict):
""" """
A class representing an SSL certificate with methods to export in various formats. A class representing an SSL certificate, behaving like a dictionary
for direct JSON serialization. It stores the certificate information internally
and provides methods for export and property access.
Attributes: Inherits from dict, so instances are directly JSON serializable.
cert_info (Dict[str, Any]): The certificate information.
Methods:
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
export_as_pem() -> str: Export the certificate as PEM format.
export_as_der() -> bytes: Export the certificate as DER format.
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
export_as_text() -> str: Export the certificate as text format.
""" """
# Use __slots__ for potential memory optimization if desired, though less common when inheriting dict
# __slots__ = ("_cert_info",) # If using slots, be careful with dict inheritance interaction
def __init__(self, cert_info: Dict[str, Any]): def __init__(self, cert_info: Dict[str, Any]):
self._cert_info = self._decode_cert_data(cert_info)
@staticmethod
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
""" """
Create SSLCertificate instance from a URL. Initializes the SSLCertificate object.
Args: Args:
url (str): URL of the website. cert_info (Dict[str, Any]): The raw certificate dictionary.
timeout (int): Timeout for the connection (default: 10).
Returns:
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
""" """
try: # 1. Decode the data (handle bytes -> str)
hostname = urlparse(url).netloc decoded_info = self._decode_cert_data(cert_info)
if ":" in hostname:
hostname = hostname.split(":")[0]
context = ssl.create_default_context() # 2. Store the decoded info internally (optional but good practice)
with socket.create_connection((hostname, 443), timeout=timeout) as sock: # self._cert_info = decoded_info # You can keep this if methods rely on it
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert_binary = ssock.getpeercert(binary_form=True)
x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
)
cert_info = { # 3. Initialize the dictionary part of the object with the decoded data
"subject": dict(x509.get_subject().get_components()), super().__init__(decoded_info)
"issuer": dict(x509.get_issuer().get_components()),
"version": x509.get_version(),
"serial_number": hex(x509.get_serial_number()),
"not_before": x509.get_notBefore(),
"not_after": x509.get_notAfter(),
"fingerprint": x509.digest("sha256").hex(),
"signature_algorithm": x509.get_signature_algorithm(),
"raw_cert": base64.b64encode(cert_binary),
}
# Add extensions
extensions = []
for i in range(x509.get_extension_count()):
ext = x509.get_extension(i)
extensions.append(
{"name": ext.get_short_name(), "value": str(ext)}
)
cert_info["extensions"] = extensions
return SSLCertificate(cert_info)
except Exception:
return None
@staticmethod @staticmethod
def _decode_cert_data(data: Any) -> Any: def _decode_cert_data(data: Any) -> Any:
"""Helper method to decode bytes in certificate data.""" """Helper method to decode bytes in certificate data."""
if isinstance(data, bytes): if isinstance(data, bytes):
return data.decode("utf-8") try:
# Try UTF-8 first, fallback to latin-1 for arbitrary bytes
return data.decode("utf-8")
except UnicodeDecodeError:
return data.decode("latin-1") # Or handle as needed, maybe hex representation
elif isinstance(data, dict): elif isinstance(data, dict):
return { return {
( (
@@ -97,36 +58,119 @@ class SSLCertificate:
return [SSLCertificate._decode_cert_data(item) for item in data] return [SSLCertificate._decode_cert_data(item) for item in data]
return data return data
@staticmethod
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
"""
Create SSLCertificate instance from a URL. Fetches cert info and initializes.
(Fetching logic remains the same)
"""
cert_info_raw = None # Variable to hold the fetched dict
try:
hostname = urlparse(url).netloc
if ":" in hostname:
hostname = hostname.split(":")[0]
context = ssl.create_default_context()
# Set check_hostname to False and verify_mode to CERT_NONE temporarily
# for potentially problematic certificates during fetch, but parse the result regardless.
# context.check_hostname = False
# context.verify_mode = ssl.CERT_NONE
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert_binary = ssock.getpeercert(binary_form=True)
if not cert_binary:
print(f"Warning: No certificate returned for {hostname}")
return None
x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
)
# Create the dictionary directly
cert_info_raw = {
"subject": dict(x509.get_subject().get_components()),
"issuer": dict(x509.get_issuer().get_components()),
"version": x509.get_version(),
"serial_number": hex(x509.get_serial_number()),
"not_before": x509.get_notBefore(), # Keep as bytes initially, _decode handles it
"not_after": x509.get_notAfter(), # Keep as bytes initially
"fingerprint": x509.digest("sha256").hex(), # hex() is already string
"signature_algorithm": x509.get_signature_algorithm(), # Keep as bytes
"raw_cert": base64.b64encode(cert_binary), # Base64 is bytes, _decode handles it
}
# Add extensions
extensions = []
for i in range(x509.get_extension_count()):
ext = x509.get_extension(i)
# get_short_name() returns bytes, str(ext) handles value conversion
extensions.append(
{"name": ext.get_short_name(), "value": str(ext)}
)
cert_info_raw["extensions"] = extensions
except ssl.SSLCertVerificationError as e:
print(f"SSL Verification Error for {url}: {e}")
# Decide if you want to proceed or return None based on your needs
# You might try fetching without verification here if needed, but be cautious.
return None
except socket.gaierror:
print(f"Could not resolve hostname: {hostname}")
return None
except socket.timeout:
print(f"Connection timed out for {url}")
return None
except Exception as e:
print(f"Error fetching/processing certificate for {url}: {e}")
# Log the full error details if needed: logging.exception("Cert fetch error")
return None
# If successful, create the SSLCertificate instance from the dictionary
if cert_info_raw:
return SSLCertificate(cert_info_raw)
else:
return None
# --- Properties now access the dictionary items directly via self[] ---
@property
def issuer(self) -> Dict[str, str]:
return self.get("issuer", {}) # Use self.get for safety
@property
def subject(self) -> Dict[str, str]:
return self.get("subject", {})
@property
def valid_from(self) -> str:
return self.get("not_before", "")
@property
def valid_until(self) -> str:
return self.get("not_after", "")
@property
def fingerprint(self) -> str:
return self.get("fingerprint", "")
# --- Export methods can use `self` directly as it is the dict ---
def to_json(self, filepath: Optional[str] = None) -> Optional[str]: def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
""" """Export certificate as JSON."""
Export certificate as JSON. # `self` is already the dictionary we want to serialize
json_str = json.dumps(self, indent=2, ensure_ascii=False)
Args:
filepath (Optional[str]): Path to save the JSON file (default: None).
Returns:
Optional[str]: JSON string if successful, None otherwise.
"""
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
if filepath: if filepath:
Path(filepath).write_text(json_str, encoding="utf-8") Path(filepath).write_text(json_str, encoding="utf-8")
return None return None
return json_str return json_str
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
""" """Export certificate as PEM."""
Export certificate as PEM.
Args:
filepath (Optional[str]): Path to save the PEM file (default: None).
Returns:
Optional[str]: PEM string if successful, None otherwise.
"""
try: try:
# Decode the raw_cert (which should be string due to _decode)
raw_cert_bytes = base64.b64decode(self.get("raw_cert", ""))
x509 = OpenSSL.crypto.load_certificate( x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, OpenSSL.crypto.FILETYPE_ASN1, raw_cert_bytes
base64.b64decode(self._cert_info["raw_cert"]),
) )
pem_data = OpenSSL.crypto.dump_certificate( pem_data = OpenSSL.crypto.dump_certificate(
OpenSSL.crypto.FILETYPE_PEM, x509 OpenSSL.crypto.FILETYPE_PEM, x509
@@ -136,49 +180,25 @@ class SSLCertificate:
Path(filepath).write_text(pem_data, encoding="utf-8") Path(filepath).write_text(pem_data, encoding="utf-8")
return None return None
return pem_data return pem_data
except Exception: except Exception as e:
return None print(f"Error converting to PEM: {e}")
return None
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
""" """Export certificate as DER."""
Export certificate as DER.
Args:
filepath (Optional[str]): Path to save the DER file (default: None).
Returns:
Optional[bytes]: DER bytes if successful, None otherwise.
"""
try: try:
der_data = base64.b64decode(self._cert_info["raw_cert"]) # Decode the raw_cert (which should be string due to _decode)
der_data = base64.b64decode(self.get("raw_cert", ""))
if filepath: if filepath:
Path(filepath).write_bytes(der_data) Path(filepath).write_bytes(der_data)
return None return None
return der_data return der_data
except Exception: except Exception as e:
return None print(f"Error converting to DER: {e}")
return None
@property # Optional: Add __repr__ for better debugging
def issuer(self) -> Dict[str, str]: def __repr__(self) -> str:
"""Get certificate issuer information.""" subject_cn = self.subject.get('CN', 'N/A')
return self._cert_info.get("issuer", {}) issuer_cn = self.issuer.get('CN', 'N/A')
return f"<SSLCertificate Subject='{subject_cn}' Issuer='{issuer_cn}'>"
@property
def subject(self) -> Dict[str, str]:
"""Get certificate subject information."""
return self._cert_info.get("subject", {})
@property
def valid_from(self) -> str:
"""Get certificate validity start date."""
return self._cert_info.get("not_before", "")
@property
def valid_until(self) -> str:
"""Get certificate validity end date."""
return self._cert_info.get("not_after", "")
@property
def fingerprint(self) -> str:
"""Get certificate fingerprint."""
return self._cert_info.get("fingerprint", "")

644
deploy/docker/README-new.md Normal file
View File

@@ -0,0 +1,644 @@
# Crawl4AI Docker Guide 🐳
## Table of Contents
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended)
- [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run)
- [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
- [Understanding Request Schema](#understanding-request-schema)
- [REST API Examples](#rest-api-examples)
- [Python SDK](#python-sdk)
- [Metrics & Monitoring](#metrics--monitoring)
- [Deployment Scenarios](#deployment-scenarios)
- [Complete Examples](#complete-examples)
- [Server Configuration](#server-configuration)
- [Understanding config.yml](#understanding-configyml)
- [JWT Authentication](#jwt-authentication)
- [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
- [Customizing Your Configuration](#customizing-your-configuration)
- [Configuration Recommendations](#configuration-recommendations)
- [Getting Help](#getting-help)
## Prerequisites
Before we dive in, make sure you have:
- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
- `git` for cloning the repository.
- At least 4GB of RAM available for the container (more recommended for heavy use).
- Python 3.10+ (if using the Python SDK).
- Node.js 16+ (if using the Node.js examples).
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
## Installation
We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs.
### Option 1: Using Docker Compose (Recommended)
Docker Compose simplifies building and running the service, especially for local development and testing across different platforms.
#### 1. Clone Repository
```bash
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
```
#### 2. Environment Setup (API Keys)
If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
```bash
# Make sure you are in the 'crawl4ai' root directory
cp deploy/docker/.llm.env.example .llm.env
# Now edit .llm.env and add your API keys
# Example content:
# OPENAI_API_KEY=sk-your-key
# ANTHROPIC_API_KEY=your-anthropic-key
# ...
```
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
#### 3. Build and Run with Compose
The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**.
* **Build and Run Locally (AMD64):**
```bash
# Builds the image locally using Dockerfile and runs it
docker compose --profile local-amd64 up --build -d
```
* **Build and Run Locally (ARM64):**
```bash
# Builds the image locally using Dockerfile and runs it
docker compose --profile local-arm64 up --build -d
```
* **Run Pre-built Image from Docker Hub (AMD64):**
```bash
# Pulls and runs the specified AMD64 image from Docker Hub
# (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1)
docker compose --profile hub-amd64 up -d
```
* **Run Pre-built Image from Docker Hub (ARM64):**
```bash
# Pulls and runs the specified ARM64 image from Docker Hub
docker compose --profile hub-arm64 up -d
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping Compose Services
```bash
# Stop the service(s) associated with a profile (e.g., local-amd64)
docker compose --profile local-amd64 down
```
### Option 2: Manual Local Build & Run
If you prefer not to use Docker Compose for local builds.
#### 1. Clone Repository & Setup Environment
Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
#### 2. Build the Image (Multi-Arch)
Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon.
```bash
# Make sure you are in the 'crawl4ai' root directory
docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
```
#### 3. Run the Container
* **Basic run (no LLM support):**
```bash
# Replace --platform if your host is ARM64
docker run -d \
-p 11235:11235 \
--name crawl4ai-standalone \
--shm-size=1g \
--platform linux/amd64 \
crawl4ai-local:latest
```
* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory (project root)
# Replace --platform if your host is ARM64
docker run -d \
-p 11235:11235 \
--name crawl4ai-standalone \
--env-file .llm.env \
--shm-size=1g \
--platform linux/amd64 \
crawl4ai-local:latest
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping the Manual Container
```bash
docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
```
### Option 3: Using Pre-built Docker Hub Images
Pull and run images directly from Docker Hub without building locally.
#### 1. Pull the Image
We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically.
```bash
# Pull a specific version (recommended for stability)
docker pull unclecode/crawl4ai:0.5.1-d1
# Or pull the latest stable version
docker pull unclecode/crawl4ai:latest
```
#### 2. Setup Environment (API Keys)
If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section.
#### 3. Run the Container
* **Basic run:**
```bash
docker run -d \
-p 11235:11235 \
--name crawl4ai-hub \
--shm-size=1g \
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
```
* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory you are running docker from
docker run -d \
-p 11235:11235 \
--name crawl4ai-hub \
--env-file .llm.env \
--shm-size=1g \
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping the Hub Container
```bash
docker stop crawl4ai-hub && docker rm crawl4ai-hub
```
#### Docker Hub Versioning Explained
* **Image Name:** `unclecode/crawl4ai`
* **Tag Format:** `LIBRARY_VERSION-dREVISION`
* `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`).
* `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`.
* **Example:** `unclecode/crawl4ai:0.5.1-d1`
* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`.
* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture.
---
*(Rest of the document remains largely the same, but with key updates below)*
---
## Dockerfile Parameters
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
```bash
# Example: Build with 'all' features using buildx
docker buildx build \
--platform linux/amd64,linux/arm64 \
--build-arg INSTALL_TYPE=all \
-t yourname/crawl4ai-all:latest \
--load \
. # Build from root context
```
### Build Arguments Explained
| Argument | Description | Default | Options |
| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
### Build Best Practices
1. **Choose the Right Install Type**
* `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
* `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
2. **Platform Considerations**
* Use `buildx` for building multi-architecture images, especially for pushing to registries.
* Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
3. **Performance Optimization**
* The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
---
## Using the API
Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
### Python SDK
Install the SDK: `pip install crawl4ai`
```python
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
async def main():
# Point to the correct server port
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
# If JWT is enabled on the server, authenticate first:
# await client.authenticate("user@example.com") # See Server Configuration section
# Example Non-streaming crawl
print("--- Running Non-Streaming Crawl ---")
results = await client.crawl(
["https://httpbin.org/html"],
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
if results: # client.crawl returns None on failure
print(f"Non-streaming results success: {results.success}")
if results.success:
for result in results: # Iterate through the CrawlResultContainer
print(f"URL: {result.url}, Success: {result.success}")
else:
print("Non-streaming crawl failed.")
# Example Streaming crawl
print("\n--- Running Streaming Crawl ---")
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
try:
async for result in await client.crawl( # client.crawl returns an async generator for streaming
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
browser_config=BrowserConfig(headless=True),
crawler_config=stream_config
):
print(f"Streamed result: URL: {result.url}, Success: {result.success}")
except Exception as e:
print(f"Streaming crawl failed: {e}")
# Example Get schema
print("\n--- Getting Schema ---")
schema = await client.get_schema()
print(f"Schema received: {bool(schema)}") # Print whether schema was received
if __name__ == "__main__":
asyncio.run(main())
```
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
### Direct API Calls (REST)
Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
#### More Examples *(Ensure Schema example uses type/value wrapper)*
**Advanced Crawler Configuration**
*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
**Extraction Strategy**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict",
"value": {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
}
}
}
}
}
}
```
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
*(Keep Deep Crawler Example)*
### REST API Examples
Update URLs to use port `11235`.
#### Simple Crawl
```python
import requests
# Configuration objects converted to the required JSON structure
browser_config_payload = {
"type": "BrowserConfig",
"params": {"headless": True}
}
crawler_config_payload = {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
}
crawl_payload = {
"urls": ["https://httpbin.org/html"],
"browser_config": browser_config_payload,
"crawler_config": crawler_config_payload
}
response = requests.post(
"http://localhost:11235/crawl", # Updated port
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
json=crawl_payload
)
print(f"Status Code: {response.status_code}")
if response.ok:
print(response.json())
else:
print(f"Error: {response.text}")
```
#### Streaming Results
```python
import json
import httpx # Use httpx for async streaming example
async def test_stream_crawl(token: str = None): # Made token optional
"""Test the /crawl/stream endpoint with multiple URLs."""
url = "http://localhost:11235/crawl/stream" # Updated port
payload = {
"urls": [
"https://httpbin.org/html",
"https://httpbin.org/links/5/0",
],
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": "bypass"}
}
}
headers = {}
# if token:
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
try:
async with httpx.AsyncClient() as client:
async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
print(f"Status: {response.status_code} (Expected: 200)")
response.raise_for_status() # Raise exception for bad status codes
# Read streaming response line-by-line (NDJSON)
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
# Check for completion marker
if data.get("status") == "completed":
print("Stream completed.")
break
print(f"Streamed Result: {json.dumps(data, indent=2)}")
except json.JSONDecodeError:
print(f"Warning: Could not decode JSON line: {line}")
except httpx.HTTPStatusError as e:
print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
except Exception as e:
print(f"Error in streaming crawl test: {str(e)}")
# To run this example:
# import asyncio
# asyncio.run(test_stream_crawl())
```
---
## Metrics & Monitoring
Keep an eye on your crawler with these endpoints:
- `/health` - Quick health check
- `/metrics` - Detailed Prometheus metrics
- `/schema` - Full API schema
Example health check:
```bash
curl http://localhost:11235/health
```
---
*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
---
## Server Configuration
The server's behavior can be customized through the `config.yml` file.
### Understanding config.yml
The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
```yaml
# Application Configuration
app:
title: "Crawl4AI API"
version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
host: "0.0.0.0"
port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
reload: False # Default set to False - suitable for production
timeout_keep_alive: 300
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini"
api_key_env: "OPENAI_API_KEY"
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration (Used by internal Redis server managed by supervisord)
redis:
host: "localhost"
port: 6379
db: 0
password: ""
# ... other redis options ...
# Rate Limiting Configuration
rate_limiting:
enabled: True
default_limit: "1000/minute"
trusted_proxies: []
storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
# Security Configuration
security:
enabled: false # Master toggle for security features
jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
https_redirect: false # Force HTTPS (requires security.enabled=true)
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
headers: # Security headers (applied if security.enabled=true)
x_content_type_options: "nosniff"
x_frame_options: "DENY"
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
memory_threshold_percent: 95.0
rate_limiter:
base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
timeouts:
stream_init: 30.0 # Timeout for stream initialization
batch_process: 300.0 # Timeout for non-streaming /crawl processing
# Logging Configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
prometheus:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
```
*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
*(Configuration Tips and Best Practices remain the same)*
### Customizing Your Configuration
You can override the default `config.yml`.
#### Method 1: Modify Before Build
1. Edit the `deploy/docker/config.yml` file in your local repository clone.
2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
#### Method 2: Runtime Mount (Recommended for Custom Deploys)
1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
2. Mount it when running the container:
* **Using `docker run`:**
```bash
# Assumes my-custom-config.yml is in the current directory
docker run -d -p 11235:11235 \
--name crawl4ai-custom-config \
--env-file .llm.env \
--shm-size=1g \
-v $(pwd)/my-custom-config.yml:/app/config.yml \
unclecode/crawl4ai:latest # Or your specific tag
```
* **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
```yaml
services:
crawl4ai-hub-amd64: # Or your chosen service
image: unclecode/crawl4ai:latest
profiles: ["hub-amd64"]
<<: *base-config
volumes:
# Mount local custom config over the default one in the container
- ./my-custom-config.yml:/app/config.yml
# Keep the shared memory volume from base-config
- /dev/shm:/dev/shm
```
*(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
### Configuration Recommendations
1. **Security First** 🔒
- Always enable security in production
- Use specific trusted_hosts instead of wildcards
- Set up proper rate limiting to protect your server
- Consider your environment before enabling HTTPS redirect
2. **Resource Management** 💻
- Adjust memory_threshold_percent based on available RAM
- Set timeouts according to your content size and network conditions
- Use Redis for rate limiting in multi-container setups
3. **Monitoring** 📊
- Enable Prometheus if you need metrics
- Set DEBUG logging in development, INFO in production
- Regular health check monitoring is crucial
4. **Performance Tuning** ⚡
- Start with conservative rate limiter delays
- Increase batch_process timeout for large content
- Adjust stream_init timeout based on initial response times
## Getting Help
We're here to help you succeed with Crawl4AI! Here's how to get support:
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
- ⭐ Star us on GitHub to show support!
## Summary
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
- Building and running the Docker container
- Configuring the environment
- Making API requests with proper typing
- Using the Python SDK
- Monitoring your deployment
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
Happy crawling! 🕷️

View File

@@ -391,21 +391,25 @@ async def handle_crawl_request(
) )
) )
async with AsyncWebCrawler(config=browser_config) as crawler: crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
results = [] await crawler.start()
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") results = []
partial_func = partial(func, func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
urls[0] if len(urls) == 1 else urls, partial_func = partial(func,
config=crawler_config, urls[0] if len(urls) == 1 else urls,
dispatcher=dispatcher) config=crawler_config,
results = await partial_func() dispatcher=dispatcher)
return { results = await partial_func()
"success": True, await crawler.close()
"results": [result.model_dump() for result in results] return {
} "success": True,
"results": [result.model_dump() for result in results]
}
except Exception as e: except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True) logger.error(f"Crawl error: {str(e)}", exc_info=True)
if 'crawler' in locals():
await crawler.close()
raise HTTPException( raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e) detail=str(e)

View File

@@ -4,7 +4,7 @@ app:
version: "1.0.0" version: "1.0.0"
host: "0.0.0.0" host: "0.0.0.0"
port: 8020 port: 8020
reload: True reload: False
timeout_keep_alive: 300 timeout_keep_alive: 300
# Default LLM Configuration # Default LLM Configuration

View File

@@ -1,4 +1,3 @@
crawl4ai
fastapi fastapi
uvicorn uvicorn
gunicorn>=23.0.0 gunicorn>=23.0.0

View File

@@ -1,12 +1,28 @@
[supervisord] [supervisord]
nodaemon=true nodaemon=true ; Run supervisord in the foreground
logfile=/dev/null ; Log supervisord output to stdout/stderr
logfile_maxbytes=0
[program:redis] [program:redis]
command=redis-server command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
user=appuser ; Run redis as our non-root user
autorestart=true autorestart=true
priority=10 priority=10
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
stderr_logfile_maxbytes=0
[program:gunicorn] [program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app ; Working directory for the app
user=appuser ; Run gunicorn as our non-root user
autorestart=true autorestart=true
priority=20 priority=20
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
stderr_logfile_maxbytes=0
# Optional: Add filebeat or other logging agents here if needed

View File

@@ -1,15 +1,30 @@
# Base configuration (not a service, just a reusable config block) # docker-compose.yml
# Base configuration anchor for reusability
x-base-config: &base-config x-base-config: &base-config
ports: ports:
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
- "11235:11235" - "11235:11235"
- "8000:8000" # - "8080:8080" # Uncomment if needed
- "9222:9222"
- "8080:8080" # Load API keys primarily from .llm.env file
# Create .llm.env in the root directory .llm.env.example
env_file:
- .llm.env
# Define environment variables, allowing overrides from host environment
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
environment: environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes: volumes:
# Mount /dev/shm for Chromium/Playwright performance
- /dev/shm:/dev/shm - /dev/shm:/dev/shm
deploy: deploy:
resources: resources:
@@ -19,47 +34,47 @@ x-base-config: &base-config
memory: 1G memory: 1G
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
test: ["CMD", "curl", "-f", "http://localhost:11235/health"] test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s # Give the server time to start
# Run the container as the non-root user defined in the Dockerfile
user: "appuser"
services: services:
# Local build services for different platforms # --- Local Build Services ---
crawl4ai-amd64: crawl4ai-local-amd64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
platforms: platform: linux/amd64
- linux/amd64
profiles: ["local-amd64"] profiles: ["local-amd64"]
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik <<: *base-config # Inherit base configuration
crawl4ai-arm64: crawl4ai-local-arm64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false platform: linux/arm64
platforms:
- linux/arm64
profiles: ["local-arm64"] profiles: ["local-arm64"]
<<: *base-config <<: *base-config
# Hub services for different platforms and versions # --- Docker Hub Image Services ---
crawl4ai-hub-amd64: crawl4ai-hub-amd64:
image: unclecode/crawl4ai:${VERSION:-basic}-amd64 image: unclecode/crawl4ai:${VERSION:-latest}-amd64
profiles: ["hub-amd64"] profiles: ["hub-amd64"]
<<: *base-config <<: *base-config
crawl4ai-hub-arm64: crawl4ai-hub-arm64:
image: unclecode/crawl4ai:${VERSION:-basic}-arm64 image: unclecode/crawl4ai:${VERSION:-latest}-arm64
profiles: ["hub-arm64"] profiles: ["hub-arm64"]
<<: *base-config <<: *base-config

View File

@@ -357,8 +357,7 @@ async def demo_performance_analysis():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
capture_network_requests=True, capture_network_requests=True,
wait_until="networkidle", page_timeout=60 * 2 * 1000 # 120 seconds
page_timeout=60000 # 60 seconds
) )
result = await crawler.arun( result = await crawler.arun(
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
"url": url, "url": url,
"duration_ms": duration "duration_ms": duration
}) })
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
# Convert to milliseconds
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
resource_timings[resource_type].append({
"url": url,
"duration_ms": duration
})
# Calculate statistics for each resource type # Calculate statistics for each resource type
print("\nPerformance by resource type:") print("\nPerformance by resource type:")
@@ -455,14 +461,14 @@ async def main():
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
# Run basic examples # Run basic examples
await demo_basic_network_capture() # await demo_basic_network_capture()
await demo_basic_console_capture() await demo_basic_console_capture()
await demo_combined_capture() # await demo_combined_capture()
# Run advanced examples # Run advanced examples
await analyze_spa_network_traffic() # await analyze_spa_network_traffic()
await demo_security_analysis() # await demo_security_analysis()
await demo_performance_analysis() # await demo_performance_analysis()
print("\n=== Examples Complete ===") print("\n=== Examples Complete ===")
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")

View File

@@ -4,7 +4,7 @@ import json
import base64 import base64
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy

View File

@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
) )
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator from crawl4ai import DefaultMarkdownGenerator

View File

@@ -0,0 +1,444 @@
/* ==== File: docs/ask_ai/ask_ai.css ==== */
/* NOTE: this sheet runs inside an iframe and relies on CSS custom properties
   inherited from the parent document; every var() below carries a
   --fallback-* default for when inheritance does not happen. */
/* --- Basic Reset & Font --- */
body {
  /* Attempt to inherit variables from parent window (iframe context) */
  /* Fallback values if variables are not inherited */
  --fallback-bg: #070708;
  --fallback-font: #e8e9ed;
  --fallback-secondary: #a3abba;
  --fallback-primary: #50ffff;
  --fallback-primary-dimmed: #09b5a5;
  --fallback-border: #1d1d20;
  --fallback-code-bg: #1e1e1e;
  --fallback-invert-font: #222225;
  --font-stack: dm, Monaco, Courier New, monospace, serif;
  font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
  background-color: var(--background-color, var(--fallback-bg));
  color: var(--font-color, var(--fallback-font));
  margin: 0;
  padding: 0;
  font-size: 14px; /* Match global font size */
  line-height: 1.5em; /* Match global line height */
  height: 100vh; /* Ensure body takes full height */
  overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
  display: flex; /* Use flex for the main container */
}
a {
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  transition: color 0.2s;
}
a:hover {
  color: var(--primary-color, var(--fallback-primary));
}
/* --- Main Container Layout --- */
.ai-assistant-container {
  display: flex;
  width: 100%;
  height: 100%;
  background-color: var(--background-color, var(--fallback-bg));
}
/* --- Sidebar Styling --- */
.sidebar {
  flex-shrink: 0; /* Prevent sidebars from shrinking */
  height: 100%;
  display: flex;
  flex-direction: column;
  /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
  overflow-y: hidden; /* Header fixed, list scrolls */
}
.left-sidebar {
  flex-basis: 240px; /* Width of history panel */
  border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.right-sidebar {
  flex-basis: 280px; /* Width of citations panel */
  border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.sidebar header {
  padding: 0.6em 1em;
  border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
  flex-shrink: 0;
  display: flex;
  justify-content: space-between;
  align-items: center;
}
.sidebar header h3 {
  margin: 0;
  font-size: 1.1em;
  color: var(--font-color, var(--fallback-font));
}
.sidebar ul {
  list-style: none;
  padding: 0;
  margin: 0;
  overflow-y: auto; /* Enable scrolling for the list */
  flex-grow: 1; /* Allow list to take remaining space */
  padding: 0.5em 0; /* NOTE: overrides the `padding: 0` above (source order) */
}
.sidebar ul li {
  padding: 0.3em 1em;
}
.sidebar ul li.no-citations,
.sidebar ul li.no-history {
  color: var(--secondary-color, var(--fallback-secondary));
  font-style: italic;
  font-size: 0.9em;
  padding-left: 1em;
}
.sidebar ul li a {
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  display: block;
  padding: 0.2em 0.5em;
  border-radius: 3px;
  transition: background-color 0.2s, color 0.2s;
}
.sidebar ul li a:hover {
  color: var(--primary-color, var(--fallback-primary));
  background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
}
/* Style for active history item */
#history-list li.active a {
  color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  font-weight: bold;
  background-color: rgba(80, 255, 255, 0.12);
}
/* --- Chat Panel Styling --- */
#chat-panel {
  flex-grow: 1; /* Take remaining space */
  display: flex;
  flex-direction: column;
  height: 100%;
  overflow: hidden; /* Prevent overflow, internal elements handle scroll */
}
#chat-messages {
  flex-grow: 1;
  overflow-y: auto; /* Scrollable chat history */
  padding: 1em 1.5em;
  border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.message {
  margin-bottom: 1em;
  padding: 0.8em 1.2em;
  border-radius: 8px;
  max-width: 90%; /* Slightly wider */
  line-height: 1.6;
  /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
  white-space: pre-wrap;
  word-wrap: break-word; /* Ensure long words break */
}
.user-message {
  background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
  color: var(--font-color, var(--fallback-font));
  margin-left: auto; /* Align user messages to the right */
  text-align: left;
}
.ai-message {
  background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
  color: var(--font-color, var(--fallback-font));
  margin-right: auto; /* Align AI messages to the left */
  border: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.ai-message.welcome-message {
  border: none;
  background-color: transparent;
  max-width: 100%;
  text-align: center;
  color: var(--secondary-color, var(--fallback-secondary));
  white-space: normal;
}
/* Styles for code within messages */
.ai-message code {
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
  /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
  padding: 0.1em 0.4em;
  border-radius: 4px;
  font-size: 0.9em;
}
.ai-message pre {
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--background-color, var(--fallback-bg)) !important;
  padding: 1em;
  border-radius: 5px;
  overflow-x: auto;
  margin: 0.8em 0;
  white-space: pre;
}
.ai-message pre code {
  background-color: transparent !important;
  padding: 0;
  font-size: inherit;
}
/* Override white-space for specific elements generated by Markdown */
.ai-message p,
.ai-message ul,
.ai-message ol,
.ai-message blockquote {
  white-space: normal; /* Allow standard wrapping for block elements */
}
/* --- Markdown Element Styling within Messages --- */
.message p {
  margin-top: 0;
  margin-bottom: 0.5em;
}
.message p:last-child {
  margin-bottom: 0;
}
.message ul,
.message ol {
  margin: 0.5em 0 0.5em 1.5em;
  padding: 0;
}
.message li {
  margin-bottom: 0.2em;
}
/* Code block styling (adjusts previous rules slightly) */
.message code {
  /* Inline code */
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--font-color);
  padding: 0.1em 0.4em;
  border-radius: 4px;
  font-size: 0.9em;
  /* Ensure inline code breaks nicely */
  word-break: break-all;
  white-space: normal; /* Allow inline code to wrap if needed */
}
.message pre {
  /* Code block container */
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--background-color, var(--fallback-bg)) !important;
  padding: 1em;
  border-radius: 5px;
  overflow-x: auto;
  margin: 0.8em 0;
  font-size: 0.9em; /* Slightly smaller code blocks */
}
.message pre code {
  /* Code within code block */
  background-color: transparent !important;
  padding: 0;
  font-size: inherit;
  word-break: normal; /* Don't break words in code blocks */
  white-space: pre; /* Preserve whitespace strictly in code blocks */
}
/* Thinking indicator */
.message-thinking {
  display: inline-block;
  width: 5px;
  height: 5px;
  background-color: var(--primary-color, var(--fallback-primary));
  border-radius: 50%;
  margin-left: 8px;
  vertical-align: middle;
  animation: thinking 1s infinite ease-in-out;
}
@keyframes thinking {
  0%,
  100% {
    opacity: 0.5;
    transform: scale(0.8);
  }
  50% {
    opacity: 1;
    transform: scale(1.2);
  }
}
/* --- Thinking Indicator (Blinking Cursor Style) --- */
.thinking-indicator-cursor {
  display: inline-block;
  width: 10px; /* Width of the cursor */
  height: 1.1em; /* Match line height */
  background-color: var(--primary-color, var(--fallback-primary));
  margin-left: 5px;
  vertical-align: text-bottom; /* Align with text baseline */
  animation: blink-cursor 1s step-end infinite;
}
@keyframes blink-cursor {
  from,
  to {
    background-color: transparent;
  }
  50% {
    background-color: var(--primary-color, var(--fallback-primary));
  }
}
#chat-input-area {
  flex-shrink: 0; /* Prevent input area from shrinking */
  padding: 1em 1.5em;
  display: flex;
  align-items: flex-end; /* Align items to bottom */
  gap: 10px;
  background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
}
#chat-input-area textarea {
  flex-grow: 1;
  padding: 0.8em 1em;
  border: 1px solid var(--progress-bar-background, var(--fallback-border));
  background-color: var(--background-color, var(--fallback-bg));
  color: var(--font-color, var(--fallback-font));
  border-radius: 5px;
  resize: none; /* Disable manual resize */
  font-family: inherit;
  font-size: 1em;
  line-height: 1.4;
  max-height: 150px; /* Limit excessive height */
  overflow-y: auto;
  /* rows: 2; */
}
#chat-input-area button {
  /* Basic button styling - maybe inherit from main theme? */
  padding: 0.6em 1.2em;
  border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  color: var(--background-color, var(--fallback-bg));
  border-radius: 5px;
  cursor: pointer;
  font-size: 0.9em;
  transition: background-color 0.2s, border-color 0.2s;
  height: min-content; /* Align with bottom of textarea */
}
#chat-input-area button:hover {
  background-color: var(--primary-color, var(--fallback-primary));
  border-color: var(--primary-color, var(--fallback-primary));
}
#chat-input-area button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}
.loading-indicator {
  font-size: 0.9em;
  color: var(--secondary-color, var(--fallback-secondary));
  margin-right: 10px;
  align-self: center;
}
/* --- Buttons --- */
/* Inherit some button styles if possible */
.btn.btn-sm {
  color: var(--font-color, var(--fallback-font));
  padding: 0.2em 0.5em;
  font-size: 0.8em;
  border: 1px solid var(--secondary-color, var(--fallback-secondary));
  background: none;
  border-radius: 3px;
  cursor: pointer;
}
.btn.btn-sm:hover {
  border-color: var(--font-color, var(--fallback-font));
  background-color: var(--progress-bar-background, var(--fallback-border));
}
/* --- Basic Responsiveness --- */
@media screen and (max-width: 900px) {
  .left-sidebar {
    flex-basis: 200px; /* Shrink history */
  }
  .right-sidebar {
    flex-basis: 240px; /* Shrink citations */
  }
}
@media screen and (max-width: 768px) {
  /* Stack layout on mobile? Or hide sidebars? Hiding for now */
  .sidebar {
    display: none; /* Hide sidebars on small screens */
  }
  /* Could add toggle buttons later */
}
/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */
/* NOTE: the selectors below intentionally duplicate rules defined earlier in
   this file; being later in source order, they win the cascade. */
.sidebar ul li {
  /* Use flexbox to align link and delete button */
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0; /* Remove padding from li, add to link/button */
  margin: 0.1em 0; /* Small vertical margin */
}
.sidebar ul li a {
  /* Link takes most space */
  flex-grow: 1;
  padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
  /* Make ellipsis work for long titles */
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  /* Keep existing link styles */
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  display: block;
  border-radius: 3px;
  transition: background-color 0.2s, color 0.2s;
}
.sidebar ul li a:hover {
  color: var(--primary-color, var(--fallback-primary));
  background-color: rgba(80, 255, 255, 0.08);
}
/* Style for active history item's link */
#history-list li.active a {
  color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  font-weight: bold;
  background-color: rgba(80, 255, 255, 0.12);
}
/* --- Delete Chat Button --- */
.delete-chat-btn {
  flex-shrink: 0; /* Don't shrink */
  background: none;
  border: none;
  color: var(--secondary-color, var(--fallback-secondary));
  cursor: pointer;
  padding: 0.4em 0.8em; /* Padding around icon */
  font-size: 0.9em;
  opacity: 0.5; /* Dimmed by default */
  transition: opacity 0.2s, color 0.2s;
  margin-left: 5px; /* Space between link and button */
  border-radius: 3px;
}
.sidebar ul li:hover .delete-chat-btn,
.delete-chat-btn:hover {
  opacity: 1; /* Show fully on hover */
  color: var(--error-color, #ff3c74); /* Use error color on hover */
}
.delete-chat-btn:focus {
  outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
  opacity: 1;
}

603
docs/md_v2/ask_ai/ask-ai.js Normal file
View File

@@ -0,0 +1,603 @@
// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====
document.addEventListener("DOMContentLoaded", () => {
console.log("AI Assistant JS V2 Loaded");

// --- DOM Element Selectors ---
const historyList = document.getElementById("history-list");
const newChatButton = document.getElementById("new-chat-button");
const chatMessages = document.getElementById("chat-messages");
const chatInput = document.getElementById("chat-input");
const sendButton = document.getElementById("send-button");
const citationsList = document.getElementById("citations-list");

// --- Constants ---
const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
const CHAT_PREFIX = "aiAssistantChat_v1_";

// --- State ---
let currentChatId = null;
let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
let isThinking = false;
let streamInterval = null; // To control the streaming interval

// --- Event Listeners ---
sendButton.addEventListener("click", handleSendMessage);
chatInput.addEventListener("keydown", handleInputKeydown);
// BUG FIX: wiring `handleNewChat` directly as the listener made the browser
// pass the click MouseEvent as its `isFromQuery` parameter. The event object
// is truthy, so a manual "New Chat" wrongly took the from-query path
// (skipping saveCurrentChat() and the welcome message). Wrap it so no
// argument leaks through.
newChatButton.addEventListener("click", () => handleNewChat());
chatInput.addEventListener("input", autoGrowTextarea);

// --- Initialization ---
loadChatHistoryIndex(); // Load history list on startup
// Reading window.parent.location throws a SecurityError when this page is
// embedded cross-origin; guard it so initialization still completes.
// checkForInitialQuery(null) logs a warning and returns false.
let parentLocation = null;
try {
  parentLocation = window.parent.location;
} catch (e) {
  console.warn("Ask AI: cannot access parent window location.", e);
}
const initialQuery = checkForInitialQuery(parentLocation); // Check for query param
if (!initialQuery) {
  loadInitialChat(); // Load normally if no query
}
// --- Core Functions ---
// Handle a send action: record the user's message, clear the input, create an
// empty AI bubble, and stream a (currently simulated) response into it.
// No-op while a previous response is still streaming.
function handleSendMessage() {
  const userMessageText = chatInput.value.trim();
  // Ignore empty input and re-entrant sends while a response is in flight.
  if (!userMessageText || isThinking) return;
  setThinking(true); // Start thinking state
  // Add user message to state and UI
  const userMessage = { sender: "user", text: userMessageText };
  conversationHistory.push(userMessage);
  addMessageToChat(userMessage, false); // Add user message without parsing markdown
  chatInput.value = "";
  autoGrowTextarea(); // Reset textarea height
  // Prepare for AI response (create empty div)
  const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator
  // TODO: Generate fingerprint/JWT here
  // TODO: Send `conversationHistory` + JWT to backend API
  // Replace placeholder below with actual API call
  // The backend should ideally return a stream of text tokens
  // --- Placeholder Streaming Simulation ---
  const simulatedFullResponse = `Okay, Heres a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:
\`\`\`python
import asyncio
from crawl4ai import AsyncWebCrawler
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print(result.markdown[:300]) # Print first 300 chars
if __name__ == "__main__":
asyncio.run(main())
\`\`\`
A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;
  // Simulate receiving the response stream
  streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);
  // // Simulate receiving citations *after* stream starts (or with first chunk)
  // setTimeout(() => {
  //   addCitations([
  //     { title: "Simulated Doc 1", url: "#sim1" },
  //     { title: "Another Concept", url: "#sim2" },
  //   ]);
  // }, 500); // Citations appear shortly after thinking starts
}

// Send on plain Enter; Shift+Enter falls through and inserts a newline.
function handleInputKeydown(event) {
  if (event.key === "Enter" && !event.shiftKey) {
    event.preventDefault(); // Stop the newline from being inserted
    handleSendMessage();
  }
}
// Build a message bubble, render its markdown via marked.parse, highlight AI
// code blocks with hljs, optionally attach a "thinking" cursor, wrap each
// <pre> in a .terminal container, append it to the chat, and scroll down.
// Returns the created element so callers can stream content into it.
function addMessageToChat(message, addThinkingIndicator = false) {
  const messageDiv = document.createElement("div");
  messageDiv.classList.add("message", `${message.sender}-message`);
  // Parse markdown and set HTML
  // NOTE(review): user-authored text also goes through marked.parse into
  // innerHTML here — confirm markdown/HTML sanitization is acceptable.
  messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";
  if (message.sender === "ai") {
    // Apply Syntax Highlighting AFTER setting innerHTML
    messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
      if (typeof hljs !== "undefined") {
        // Check if already highlighted to prevent double-highlighting issues
        if (!block.classList.contains("hljs")) {
          hljs.highlightElement(block);
        }
      } else {
        console.warn("highlight.js (hljs) not found for syntax highlighting.");
      }
    });
    // Add thinking indicator if needed (and not already present)
    if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
      const thinkingDiv = document.createElement("div");
      thinkingDiv.className = "thinking-indicator-cursor";
      messageDiv.appendChild(thinkingDiv);
    }
  } else {
    // User messages remain plain text
    // messageDiv.textContent = message.text;
  }
  // wrap each pre in a div.terminal
  messageDiv.querySelectorAll("pre").forEach((block) => {
    const wrapper = document.createElement("div");
    wrapper.className = "terminal";
    block.parentNode.insertBefore(wrapper, block);
    wrapper.appendChild(block);
  });
  chatMessages.appendChild(messageDiv);
  // Scroll only if user is near the bottom? (More advanced)
  // Simple scroll for now:
  scrollToBottom();
  return messageDiv; // Return the created element
}
// Simulate a token-by-token streaming response into `messageDiv`.
// Every 50ms tick re-renders the accumulated markdown plus a blinking cursor;
// when all tokens are emitted it does the final highlight pass, extracts
// citation links, records the AI message in history, and persists the chat.
// Uses the shared module-level `streamInterval` so setThinking() can cancel it.
function streamSimulatedResponse(messageDiv, fullText) {
  const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
  if (thinkingIndicator) thinkingIndicator.remove();
  // Split on whitespace but keep the separators (capture group) so spacing
  // is reproduced exactly in the re-assembled text.
  const tokens = fullText.split(/(\s+)/);
  let currentText = "";
  let tokenIndex = 0;
  // Clear previous interval just in case
  if (streamInterval) clearInterval(streamInterval);
  streamInterval = setInterval(() => {
    const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor for streaming
    if (tokenIndex < tokens.length) {
      currentText += tokens[tokenIndex];
      // Render intermediate markdown + cursor
      messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
      // Re-highlight code blocks on each stream update - might be slightly inefficient
      // but ensures partial code blocks look okay. Highlight only final on completion.
      // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
      //   hljs.highlightElement(block);
      // });
      scrollToBottom(); // Keep scrolling as content streams
      tokenIndex++;
    } else {
      // Streaming finished
      clearInterval(streamInterval);
      streamInterval = null;
      // Final render without cursor
      messageDiv.innerHTML = marked.parse(currentText);
      // === Final Syntax Highlighting ===
      messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
        if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
          hljs.highlightElement(block);
        }
      });
      // === Extract Citations ===
      const citations = extractMarkdownLinks(currentText);
      // Wrap each pre in a div.terminal
      messageDiv.querySelectorAll("pre").forEach((block) => {
        const wrapper = document.createElement("div");
        wrapper.className = "terminal";
        block.parentNode.insertBefore(wrapper, block);
        wrapper.appendChild(block);
      });
      const aiMessage = { sender: "ai", text: currentText, citations: citations };
      conversationHistory.push(aiMessage);
      updateCitationsDisplay();
      saveCurrentChat();
      setThinking(false);
    }
  }, 50); // Adjust speed
}
// Pull [title](url) links out of markdown text, skipping internal
// "#citation-" anchors, and de-duplicate by URL (first occurrence wins).
// Returns an array of { title, url } objects with trimmed fields.
function extractMarkdownLinks(markdownText) {
  const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
  const seenUrls = new Set();
  const links = [];
  for (const m of markdownText.matchAll(linkPattern)) {
    // Avoid adding self-links from within the citations list if AI includes them
    if (m[2].startsWith("#citation-")) continue;
    const url = m[2].trim();
    if (seenUrls.has(url)) continue; // de-duplicate by URL
    seenUrls.add(url);
    links.push({ title: m[1].trim(), url: url });
  }
  return links;
}
// === REVISED Function to Display Citations ===
// Show the citations attached to the most recent AI message that has any;
// falls back to a "no citations" placeholder when none exist.
function updateCitationsDisplay() {
  let lastCitations = null;
  // Find the most recent AI message with citations
  for (let i = conversationHistory.length - 1; i >= 0; i--) {
    if (
      conversationHistory[i].sender === "ai" &&
      conversationHistory[i].citations &&
      conversationHistory[i].citations.length > 0
    ) {
      lastCitations = conversationHistory[i].citations;
      break; // Found the latest citations
    }
  }
  citationsList.innerHTML = ""; // Clear previous
  if (!lastCitations) {
    citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
    return;
  }
  lastCitations.forEach((citation, index) => {
    const li = document.createElement("li");
    const a = document.createElement("a");
    // Generate a unique ID for potential internal linking if needed
    // a.id = `citation-${index}`;
    a.href = citation.url || "#";
    a.textContent = citation.title;
    a.target = "_top"; // Open in main window
    li.appendChild(a);
    citationsList.appendChild(li);
  });
}

// Render an explicit citation list into the sidebar (also used with [] to
// clear the panel when starting a new chat).
function addCitations(citations) {
  citationsList.innerHTML = ""; // Clear
  if (!citations || citations.length === 0) {
    citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
    return;
  }
  citations.forEach((citation) => {
    const li = document.createElement("li");
    const a = document.createElement("a");
    a.href = citation.url || "#";
    a.textContent = citation.title;
    a.target = "_top"; // Open in main window
    li.appendChild(a);
    citationsList.appendChild(li);
  });
}
// Toggle the "AI is responding" UI state: disables input/send while a
// response is streaming and cancels any in-flight stream timer on re-entry.
function setThinking(thinking) {
  isThinking = thinking;
  sendButton.disabled = thinking;
  chatInput.disabled = thinking;
  if (thinking) {
    chatInput.placeholder = "AI is responding...";
    // A rapid resend must not leave an orphaned streaming interval running.
    if (streamInterval) {
      clearInterval(streamInterval);
      streamInterval = null;
    }
  } else {
    chatInput.placeholder = "Ask about Crawl4AI...";
  }
}

// Resize the input textarea to fit its content (collapse first so it can shrink).
function autoGrowTextarea() {
  chatInput.style.height = "auto";
  const fitted = `${chatInput.scrollHeight}px`;
  chatInput.style.height = fitted;
}

// Pin the chat view to the newest message.
function scrollToBottom() {
  const pane = chatMessages;
  pane.scrollTop = pane.scrollHeight;
}
// --- Query Parameter Handling ---
// Look for a base64-encoded `qq` query parameter on the passed-in (parent
// window's) location. When present: start a fresh chat, submit the decoded
// text after a short delay, and scrub the parameter from the parent URL via
// replaceState. Returns true iff a query was found and processed.
function checkForInitialQuery(locationToCheck) {
  // <-- Receive location object
  if (!locationToCheck) {
    console.warn("Ask AI: Could not access parent window location.");
    return false;
  }
  const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
  const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
  if (encodedQuery) {
    console.log("Initial query found (qq):", encodedQuery);
    try {
      // atob -> escape -> decodeURIComponent is the legacy idiom for decoding
      // base64-encoded UTF-8 text in the browser.
      const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
      console.log("Decoded query:", decodedText);
      // Start new chat immediately
      handleNewChat(true);
      // Delay setting input and sending message slightly
      setTimeout(() => {
        chatInput.value = decodedText;
        autoGrowTextarea();
        handleSendMessage();
        // Clean the PARENT window's URL
        try {
          const cleanUrl = locationToCheck.pathname;
          // Use parent's history object
          window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
        } catch (e) {
          console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
          // This might fail due to cross-origin restrictions if served differently,
          // but should work fine with mkdocs serve on the same origin.
        }
      }, 100);
      return true; // Query processed
    } catch (e) {
      console.error("Error decoding initial query (qq):", e);
      // Clean the PARENT window's URL even on error
      try {
        const cleanUrl = locationToCheck.pathname;
        window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
      } catch (cleanError) {
        console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
      }
      return false;
    }
  }
  return false; // No 'qq' query found
}
// --- History Management ---
// Start a fresh chat session: optionally save the current one, reset state,
// register the new chat in the index, and refresh the sidebar.
// `isFromQuery` is true when triggered by an incoming `qq` query parameter
// (skips the save and the welcome message).
function handleNewChat(isFromQuery = false) {
  // BUG FIX: coerce to a strict boolean. When this function is wired directly
  // as a DOM event listener, the browser passes the click Event object as the
  // first argument; it is truthy and would wrongly take the from-query path
  // (skipping saveCurrentChat() and the welcome message).
  isFromQuery = isFromQuery === true;
  if (isThinking) return; // Don't allow new chat while responding
  // Only save if NOT triggered immediately by a query parameter load
  if (!isFromQuery) {
    saveCurrentChat();
  }
  currentChatId = `chat_${Date.now()}`;
  conversationHistory = []; // Clear message history state
  chatMessages.innerHTML = ""; // Start with clean slate for query
  if (!isFromQuery) {
    // Show welcome only if manually started
    chatMessages.innerHTML =
      '<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
  }
  addCitations([]); // Clear citations
  updateCitationsDisplay(); // Clear UI
  // Add to index and save
  let index = loadChatIndex();
  // Generate a generic title initially; saveCurrentChat() refines it later
  // from the first user message.
  const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
  index.unshift({ id: currentChatId, title: newTitle }); // Newest first
  saveChatIndex(index);
  renderHistoryList(index); // Update UI
  setActiveHistoryItem(currentChatId);
  saveCurrentChat(); // Save the empty new chat state
}
// Switch to a previously saved chat: save the current one first, then restore
// messages and citations for `chatId`. A missing localStorage entry is pruned
// from the index and the next available chat is loaded instead.
function loadChat(chatId) {
  if (isThinking || chatId === currentChatId) return;
  // Check if chat data actually exists before proceeding
  const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
  if (storedChat === null) {
    console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
    deleteChatData(chatId); // Clean up index
    loadChatHistoryIndex(); // Reload history list
    loadInitialChat(); // Load next available chat
    return;
  }
  console.log(`Loading chat: ${chatId}`);
  saveCurrentChat(); // Save current before switching
  try {
    conversationHistory = JSON.parse(storedChat);
    currentChatId = chatId;
    renderChatMessages(conversationHistory);
    updateCitationsDisplay();
    setActiveHistoryItem(chatId);
  } catch (e) {
    console.error("Error loading chat:", chatId, e);
    alert("Failed to load chat data.");
    // Corrupt stored data: fall back to an empty conversation.
    conversationHistory = [];
    renderChatMessages(conversationHistory);
    updateCitationsDisplay();
  }
}

// Persist the active conversation to localStorage. Also derives a readable
// index title ("Chat about: ...") from the first user message, exactly once.
function saveCurrentChat() {
  if (currentChatId && conversationHistory.length > 0) {
    try {
      localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
      console.log(`Chat ${currentChatId} saved.`);
      // Update title in index (e.g., use first user message)
      let index = loadChatIndex();
      const currentItem = index.find((item) => item.id === currentChatId);
      if (
        currentItem &&
        conversationHistory[0]?.sender === "user" &&
        !currentItem.title.startsWith("Chat about:")
      ) {
        currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
        saveChatIndex(index);
        // Re-render history list if title changed - small optimization needed here maybe
        renderHistoryList(index);
        setActiveHistoryItem(currentChatId); // Re-set active after re-render
      }
    } catch (e) {
      console.error("Error saving chat:", currentChatId, e);
      // Handle potential storage full errors
      if (e.name === "QuotaExceededError") {
        alert("Local storage is full. Cannot save chat history.");
        // Consider implementing history pruning logic here
      }
    }
  } else if (currentChatId) {
    // Save empty state for newly created chats if needed, or remove?
    localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
  }
}
// Fetch and parse the persisted chat index. Any storage or parse failure
// degrades gracefully to an empty index.
function loadChatIndex() {
  let parsed = [];
  try {
    const raw = localStorage.getItem(CHAT_INDEX_KEY);
    if (raw) {
      parsed = JSON.parse(raw);
    }
  } catch (e) {
    console.error("Error loading chat index:", e);
    parsed = []; // Return empty array on error
  }
  return parsed;
}

// Serialize and store the chat index; failures are logged, never thrown.
function saveChatIndex(entries) {
  try {
    const serialized = JSON.stringify(entries);
    localStorage.setItem(CHAT_INDEX_KEY, serialized);
  } catch (e) {
    console.error("Error saving chat index:", e);
  }
}
// Rebuild the sidebar history list from the index array: one <li> per chat
// holding a load link and a per-chat delete button.
function renderHistoryList(indexArray) {
  historyList.innerHTML = ""; // Clear existing
  if (!indexArray || indexArray.length === 0) {
    historyList.innerHTML = '<li class="no-history">No past chats found.</li>';
    return;
  }
  indexArray.forEach((item) => {
    const li = document.createElement("li");
    li.dataset.chatId = item.id; // Add ID to li for easier selection
    const a = document.createElement("a");
    a.href = "#";
    a.dataset.chatId = item.id;
    a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
    a.title = a.textContent; // Tooltip for potentially long titles
    a.addEventListener("click", (e) => {
      e.preventDefault();
      loadChat(item.id);
    });
    // === Add Delete Button ===
    const deleteBtn = document.createElement("button");
    deleteBtn.className = "delete-chat-btn";
    deleteBtn.innerHTML = "✕"; // Cross icon (or use text/SVG/FontAwesome)
    deleteBtn.title = "Delete Chat";
    deleteBtn.dataset.chatId = item.id; // Store ID on button too
    deleteBtn.addEventListener("click", handleDeleteChat);
    li.appendChild(a);
    li.appendChild(deleteBtn); // Append button to the list item
    historyList.appendChild(li);
  });
}
// Replace the chat pane contents with the given message history.
function renderChatMessages(messages) {
  chatMessages.innerHTML = ""; // Clear existing messages
  messages.forEach((message) => {
    // Ensure highlighting is applied when loading from history
    addMessageToChat(message, false);
  });
  if (messages.length === 0) {
    chatMessages.innerHTML =
      '<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>';
  }
  // Scroll to bottom after loading messages
  scrollToBottom();
}

// Highlight the history entry for `chatId`, clearing any previous highlight.
function setActiveHistoryItem(chatId) {
  document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
  // Select the LI element directly now
  const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
  if (activeLi) {
    activeLi.classList.add("active");
  }
}

// On startup: open the most recent chat, or start a new one if none exist
// (and the query-parameter handler has not already created one).
function loadInitialChat() {
  const index = loadChatIndex();
  if (index.length > 0) {
    loadChat(index[0].id);
  } else {
    // Check if handleNewChat wasn't already called by query handler
    if (!currentChatId) {
      handleNewChat();
    }
  }
}

// Render the saved history index and re-apply the active highlight.
function loadChatHistoryIndex() {
  const index = loadChatIndex();
  renderHistoryList(index);
  if (currentChatId) setActiveHistoryItem(currentChatId);
}
// === NEW Function to Handle Delete Click ===
// Confirm and delete a chat from the history sidebar. If the deleted chat was
// the active one, switch to the most recent remaining chat or start fresh.
function handleDeleteChat(event) {
  event.stopPropagation(); // Prevent triggering loadChat on the link behind it
  const button = event.currentTarget;
  const chatIdToDelete = button.dataset.chatId;
  if (!chatIdToDelete) return;
  // Confirmation dialog
  if (
    window.confirm(
      `Are you sure you want to delete this chat session?\n"${
        button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
      }"`
    )
  ) {
    console.log(`Deleting chat: ${chatIdToDelete}`);
    // Perform deletion
    const updatedIndex = deleteChatData(chatIdToDelete);
    // If the deleted chat was the currently active one, load another chat
    if (currentChatId === chatIdToDelete) {
      currentChatId = null; // Reset current ID
      conversationHistory = []; // Clear state
      if (updatedIndex.length > 0) {
        // Load the new top chat (most recent remaining)
        loadChat(updatedIndex[0].id);
      } else {
        // No chats left, start a new one
        handleNewChat();
      }
    } else {
      // If a different chat was deleted, just re-render the list
      renderHistoryList(updatedIndex);
      // Re-apply active state in case IDs shifted (though they shouldn't)
      setActiveHistoryItem(currentChatId);
    }
  }
}

// === NEW Function to Delete Chat Data ===
// Remove a chat's stored messages and its index entry; returns the updated index.
function deleteChatData(chatId) {
  // Remove chat data
  localStorage.removeItem(CHAT_PREFIX + chatId);
  // Update index
  let index = loadChatIndex();
  index = index.filter((item) => item.id !== chatId);
  saveChatIndex(index);
  console.log(`Chat ${chatId} data and index entry removed.`);
  return index; // Return the updated index
}
// --- Virtual Scrolling Placeholder ---
// NOTE: Virtual scrolling is complex. For now, we do direct rendering.
// If performance becomes an issue with very long chats/history,
// investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
// You would replace parts of `renderChatMessages` and `renderHistoryList`
// to work with the chosen library's API (providing data and item renderers).
console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
});

View File

@@ -0,0 +1,64 @@
<!DOCTYPE html>
<!-- Standalone page for the Crawl4AI Assistant chat UI.
     Three-pane layout: history sidebar | chat panel | citations sidebar.
     All dynamic behavior lives in ask-ai.js (loaded at the bottom). -->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Crawl4AI Assistant</title>
<!-- Link main styles first for variable access -->
<link rel="stylesheet" href="../assets/layout.css">
<link rel="stylesheet" href="../assets/styles.css">
<!-- Link specific AI styles -->
<link rel="stylesheet" href="../assets/highlight.css">
<link rel="stylesheet" href="ask-ai.css">
</head>
<body>
<div class="ai-assistant-container">
<!-- Left Sidebar: Conversation History -->
<aside id="history-panel" class="sidebar left-sidebar">
<header>
<h3>History</h3>
<button id="new-chat-button" class="btn btn-sm">New Chat</button>
</header>
<ul id="history-list">
<!-- History items populated by JS -->
</ul>
</aside>
<!-- Main Area: Chat Interface -->
<main id="chat-panel">
<div id="chat-messages">
<!-- Chat messages populated by JS -->
<div class="message ai-message welcome-message">
Welcome to the Crawl4AI Assistant! How can I help you today?
</div>
</div>
<div id="chat-input-area">
<!-- Loading indicator for general waiting (optional) -->
<!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
<textarea id="chat-input" placeholder="Ask about Crawl4AI..." rows="2"></textarea>
<button id="send-button">Send</button>
</div>
</main>
<!-- Right Sidebar: Citations / Context -->
<aside id="citations-panel" class="sidebar right-sidebar">
<header>
<h3>Citations</h3>
</header>
<ul id="citations-list">
<!-- Citations populated by JS -->
<li class="no-citations">No citations for this response yet.</li>
</ul>
</aside>
</div>
<!-- Include Marked.js library -->
<!-- Marked renders Markdown in responses; highlight.min.js colors code blocks -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="../assets/highlight.min.js"></script>
<!-- Your AI Assistant Logic -->
<script src="ask-ai.js"></script>
</body>
</html>

View File

@@ -0,0 +1,62 @@
// ==== File: docs/assets/copy_code.js ====
// Adds a hover "Copy" button to every code block in the main docs content area.
document.addEventListener('DOMContentLoaded', () => {
  // Write the code text to the clipboard and flash feedback on the button.
  async function copyCodeToClipboard(codeElement, button) {
    // innerText preserves the rendered line breaks the reader actually sees.
    const textToCopy = codeElement.innerText;
    try {
      await navigator.clipboard.writeText(textToCopy);
      // Success feedback; disable briefly to avoid double-clicks.
      button.innerHTML = 'Copied!';
      button.classList.add('copied');
      button.disabled = true;
      setTimeout(() => {
        button.innerHTML = 'Copy';
        button.classList.remove('copied');
        button.disabled = false;
      }, 2000); // show "Copied!" for 2 seconds
    } catch (err) {
      console.error('Failed to copy code: ', err);
      // Error feedback on the button, then revert.
      button.innerHTML = 'Error';
      setTimeout(() => {
        button.innerHTML = 'Copy';
      }, 2000);
    }
  }

  // Only decorate code blocks inside the main content area.
  const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
  codeBlocks.forEach((codeElement) => {
    const preElement = codeElement.parentElement; // the wrapping <pre>
    // The button is absolutely positioned, so <pre> must be a positioning context.
    if (window.getComputedStyle(preElement).position === 'static') {
      preElement.style.position = 'relative';
    }
    const copyButton = document.createElement('button');
    copyButton.className = 'copy-code-button';
    copyButton.type = 'button';
    copyButton.setAttribute('aria-label', 'Copy code to clipboard');
    copyButton.title = 'Copy code to clipboard';
    copyButton.innerHTML = 'Copy'; // swap for an SVG/icon class if desired
    preElement.appendChild(copyButton);
    copyButton.addEventListener('click', () => copyCodeToClipboard(codeElement, copyButton));
  });

  console.log("Copy Code Button script loaded.");
});

View File

@@ -0,0 +1,39 @@
// ==== File: docs/assets/floating_ask_ai_button.js ====
// Injects a fixed "Ask AI" floating action button that links to the assistant
// page — skipped when the reader is already on that page.
// Fix: removed the unused `baseUrl` computation (it was never referenced;
// the link uses the root-relative `askAiPagePath` directly).
document.addEventListener('DOMContentLoaded', () => {
const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
const currentPath = window.location.pathname;
// Check if the current page IS the Ask AI page.
// includes() (with the trailing slash stripped) tolerates both
// "/core/ask-ai/" and "/core/ask-ai.html" style paths.
if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) {
console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
return; // Don't add the button on the target page
}
// --- Create the button ---
const fabLink = document.createElement('a');
fabLink.className = 'floating-ask-ai-button';
fabLink.href = askAiPagePath; // root-relative; adjust if the site lives in a sub-directory
fabLink.title = 'Ask Crawl4AI Assistant';
fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
// Add content (using SVG chat-bubble icon for better visuals)
fabLink.innerHTML = `
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
<path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
</svg>
<span>Ask AI</span>
`;
// Append to body
document.body.appendChild(fabLink);
console.log("Floating Ask AI Button added.");
});

View File

@@ -0,0 +1,119 @@
// ==== File: assets/github_stats.js ====
// Fetches star/fork counts from the GitHub REST API and injects a small stats
// badge into the fixed site header.
// Fixes: removed the unused `repoLinkElement` lookup and the dead commented-out
// fallback block; the injection step now reuses the element it already queried
// instead of querying the same selector twice.
document.addEventListener('DOMContentLoaded', async () => {
// --- Configuration ---
const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
// The badge is appended INSIDE this element when found (see injection below);
// set to null to append at the end of the header instead.
const insertBeforeSelector = '.terminal-nav';
// Hardcoded repo URL; could be read from the header's GitHub link instead.
const repoUrl = 'https://github.com/unclecode/crawl4ai';
// --- Find elements ---
const headerContainer = document.querySelector(targetHeaderSelector);
if (!headerContainer) {
console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
return;
}
// --- Extract Repo Owner/Name ---
let owner = '';
let repo = '';
try {
const url = new URL(repoUrl);
const pathParts = url.pathname.split('/').filter(part => part.length > 0);
if (pathParts.length >= 2) {
owner = pathParts[0];
repo = pathParts[1];
}
} catch (e) {
console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
return;
}
if (!owner || !repo) {
console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
return;
}
// --- Get Version (Attempt to extract from site title) ---
// Example title: "Crawl4AI Documentation (v0.5.x)"
let version = '';
const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
if (siteTitleElement) {
const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
if (match && match[1]) {
version = match[1].trim();
}
}
if (!version) {
console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
}
// --- Fetch GitHub API Data ---
let stars = '...';
let forks = '...';
try {
const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
const response = await fetch(apiUrl);
if (response.ok) {
const data = await response.json();
// Abbreviate large counts, e.g. 12345 -> "12.3k"
stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
} else {
// Unauthenticated API calls are rate-limited per IP.
console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
stars = 'N/A';
forks = 'N/A';
}
} catch (error) {
console.error('GitHub Stats: Error fetching repository data:', error);
stars = 'N/A';
forks = 'N/A';
}
// --- Create Badge HTML ---
const badgeContainer = document.createElement('div');
badgeContainer.className = 'github-stats-badge';
// Icons assume FontAwesome is loaded by the theme; the <i> tags render as
// empty elements otherwise.
badgeContainer.innerHTML = `
<a href="${repoUrl}" target="_blank" rel="noopener">
<!-- Optional Icon (FontAwesome example) -->
<!-- <i class="fab fa-github"></i> -->
<span class="repo-name">${owner}/${repo}</span>
${version ? `<span class="stat version"><i class="fas fa-tag"></i> ${version}</span>` : ''}
<span class="stat stars"><i class="fas fa-star"></i> ${stars}</span>
<span class="stat forks"><i class="fas fa-code-branch"></i> ${forks}</span>
</a>
`;
// --- Inject Badge into Header ---
// When the target element exists the badge is appended INSIDE it (the
// variable name is historical — it is not inserted before it).
const insertTarget = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
if (insertTarget) {
insertTarget.appendChild(badgeContainer);
} else {
headerContainer.appendChild(badgeContainer);
}
console.info('GitHub Stats: Badge added to header.');
});

View File

@@ -0,0 +1,441 @@
/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
/* Page skeleton: fixed full-width header, fixed left sidebar, and a centered
   flex grid that holds the main content plus a sticky right-hand ToC. */
:root {
/* NOTE(review): styles.css also declares --header-height (65px),
   --sidebar-width and --toc-width — the later-loaded sheet wins;
   confirm which values are intended. */
--header-height: 55px; /* Adjust if needed */
--sidebar-width: 280px; /* Adjust if needed */
--toc-width: 340px; /* As specified */
--content-max-width: 90em; /* Max width for the centered content */
--layout-transition-speed: 0.2s;
--global-space: 10px;
}
/* --- Basic Setup --- */
html {
scroll-behavior: smooth;
/* Keep anchor targets from hiding under the fixed header */
scroll-padding-top: calc(var(--header-height) + 15px);
box-sizing: border-box;
}
*, *:before, *:after {
box-sizing: inherit;
}
body {
padding-top: 0;
padding-bottom: 0;
background-color: var(--background-color);
color: var(--font-color);
/* Prevents horizontal scrollbars during transitions */
overflow-x: hidden;
}
/* --- Fixed Header --- */
/* Full width, fixed header */
.terminal .container:first-child { /* Assuming this targets the header container */
position: fixed;
top: 0;
left: 0;
right: 0;
height: var(--header-height);
background-color: var(--background-color);
z-index: 1000;
border-bottom: 1px solid var(--progress-bar-background);
max-width: none; /* Override any container max-width */
padding: 0 calc(var(--global-space) * 2);
}
/* --- Main Layout Container (Below Header) --- */
/* This container just provides space for the fixed header */
/* NOTE(review): :has() needs a fairly recent browser — confirm the
   supported-browser targets for the docs site. */
.container:has(.terminal-mkdocs-main-grid) {
margin: 0 auto;
padding: 0;
padding-top: var(--header-height); /* Space for fixed header */
}
/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
/* THIS is the main centered block */
.terminal-mkdocs-main-grid {
display: flex;
align-items: flex-start;
/* Enforce max-width and center */
max-width: var(--content-max-width);
margin-left: auto;
margin-right: auto;
position: relative;
/* Apply side padding within the centered block */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
/* Add margin-left to clear the fixed sidebar */
/* (this later declaration overrides the margin-left: auto above, so the
   grid clears the sidebar rather than centering symmetrically) */
margin-left: var(--sidebar-width);
}
/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
#terminal-mkdocs-side-panel {
position: fixed;
top: var(--header-height);
/* Align the sidebar's left edge with the centered grid on wide screens
   (presumably 90vw approximates the grid's width — TODO confirm) */
left: max(0px, calc((90vw - var(--content-max-width)) / 2));
bottom: 0;
width: var(--sidebar-width);
background-color: var(--background-color);
border-right: 1px solid var(--progress-bar-background);
overflow-y: auto;
z-index: 900;
padding: 1em calc(var(--global-space) * 2);
padding-bottom: 2em;
/* transition: left var(--layout-transition-speed) ease-in-out; */
}
/* --- 2. Main Content Area (Within Centered Grid) --- */
#terminal-mkdocs-main-content {
flex-grow: 1;
flex-shrink: 1;
min-width: 0; /* Flexbox shrink fix */
/* No left/right margins needed here - handled by parent grid */
margin-left: 0;
margin-right: 0;
/* Internal Padding */
padding: 1.5em 2em;
position: relative;
z-index: 1;
}
/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
#toc-sidebar {
flex-basis: var(--toc-width);
flex-shrink: 0;
width: var(--toc-width);
position: sticky; /* Sticks within the centered grid */
top: var(--header-height);
align-self: stretch;
height: calc(100vh - var(--header-height));
overflow-y: auto;
padding: 1.5em 1em;
font-size: 0.85em;
border-left: 1px solid var(--progress-bar-background);
z-index: 800;
/* display: none; /* JS handles */
}
/* (ToC link styles remain the same) */
#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
/* --- Footer Styling (Respects Centered Layout) --- */
footer {
background-color: var(--code-bg-color);
color: var(--secondary-color);
position: relative;
z-index: 10;
margin-top: 2em;
/* Apply margin-left to clear the fixed sidebar */
margin-left: var(--sidebar-width);
/* Constrain width relative to the centered grid it follows */
max-width: calc(var(--content-max-width) - var(--sidebar-width));
margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
/* Use padding consistent with the grid */
padding: 2em calc(var(--global-space) * 2);
}
/* Adjust footer grid if needed */
.terminal-mkdocs-footer-grid {
display: grid;
grid-template-columns: 1fr auto;
gap: 1em;
align-items: center;
}
/* ==========================================================================
RESPONSIVENESS (Adapting the Non-Fluid Layout)
========================================================================== */
/* --- Medium screens: Hide ToC --- */
@media screen and (max-width: 1200px) {
#toc-sidebar {
display: none;
}
.terminal-mkdocs-main-grid {
/* Grid adjusts automatically as ToC is removed */
/* Ensure grid padding remains */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
}
#terminal-mkdocs-main-content {
/* Content area naturally expands */
}
footer {
/* Footer still respects the left sidebar and overall max width */
margin-left: var(--sidebar-width);
max-width: calc(var(--content-max-width) - var(--sidebar-width));
/* Padding remains consistent */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
}
}
/* --- Small screens: Hide left sidebar, full width content & footer --- */
@media screen and (max-width: 768px) {
#terminal-mkdocs-side-panel {
/* Parked one full width off-screen; a JS toggle is expected to add
   .sidebar-visible to slide it in (see note at end of this block) */
left: calc(-1 * var(--sidebar-width));
z-index: 1100;
box-shadow: 2px 0 10px rgba(0,0,0,0.3);
}
#terminal-mkdocs-side-panel.sidebar-visible {
left: 0;
}
.terminal-mkdocs-main-grid {
/* Grid now takes full width (minus body padding) */
margin-left: 0; /* Override sidebar margin */
margin-right: 0; /* Override auto margin */
max-width: 100%; /* Allow full width */
padding-left: var(--global-space); /* Reduce padding */
padding-right: var(--global-space);
}
#terminal-mkdocs-main-content {
padding: 1.5em 1em; /* Adjust internal padding */
}
footer {
margin-left: 0; /* Full width footer */
max-width: 100%; /* Allow full width */
padding: 2em 1em; /* Adjust internal padding */
}
.terminal-mkdocs-footer-grid {
grid-template-columns: 1fr; /* Stack footer items */
text-align: center;
gap: 0.5em;
}
/* Remember JS for toggle button & overlay */
}
/* ==== GitHub Stats Badge Styling ==== */
/* Badge element injected into the header by github_stats.js */
.github-stats-badge {
display: inline-block; /* Or flex if needed */
margin-left: 2em; /* Adjust spacing */
vertical-align: middle; /* Align with other header items */
font-size: 0.9em; /* Slightly smaller font */
}
.github-stats-badge a {
color: var(--secondary-color); /* Use secondary color */
text-decoration: none;
display: flex; /* Use flex for alignment */
align-items: center;
gap: 0.8em; /* Space between items */
padding: 0.2em 0.5em;
border: 1px solid var(--progress-bar-background); /* Subtle border */
border-radius: 4px;
transition: color 0.2s, background-color 0.2s;
}
.github-stats-badge a:hover {
color: var(--font-color); /* Brighter color on hover */
background-color: var(--progress-bar-background); /* Subtle background on hover */
}
.github-stats-badge .repo-name {
color: var(--font-color); /* Make repo name stand out slightly */
font-weight: 500; /* Optional bolder weight */
}
.github-stats-badge .stat {
/* Styles for individual stats (version, stars, forks) */
white-space: nowrap; /* Prevent wrapping */
}
.github-stats-badge .stat i {
/* Optional: Style for FontAwesome icons */
margin-right: 0.3em;
color: var(--secondary-dimmed-color); /* Dimmer color for icons */
}
/* Adjust positioning relative to search/nav if needed */
/* Example: If search is floated right */
/* .terminal-nav { float: left; } */
/* .github-stats-badge { float: left; } */
/* #mkdocs-search-query { float: right; } */
/* --- Responsive adjustments --- */
@media screen and (max-width: 900px) { /* Example breakpoint */
.github-stats-badge .repo-name {
display: none; /* Hide full repo name on smaller screens */
}
.github-stats-badge {
margin-left: 1em;
}
.github-stats-badge a {
gap: 0.5em;
}
}
@media screen and (max-width: 768px) {
/* Further hide or simplify on mobile if needed */
.github-stats-badge {
display: none; /* Example: Hide completely on smallest screens */
}
}
/* --- Ask AI Selection Button --- */
/* Popup button placed next to a text selection by selection_ask_ai.js */
.ask-ai-selection-button {
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border: none;
padding: 4px 8px;
font-size: 0.8em;
border-radius: 4px;
cursor: pointer;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
transition: background-color 0.2s ease;
white-space: nowrap;
}
.ask-ai-selection-button:hover {
background-color: var(--primary-color, #50ffff);
}
/* ==== File: docs/assets/layout.css (Additions) ==== */
/* ... (keep all existing layout CSS) ... */
/* --- Copy Code Button Styling --- */
/* Ensure the parent <pre> can contain the absolutely positioned button */
#terminal-mkdocs-main-content pre {
position: relative; /* Needed for absolute positioning of child */
/* Extra top padding makes room for the button row */
padding-top: 2.5em;
padding-right: 1em; /* Ensure padding is sufficient */
}
.copy-code-button {
position: absolute;
top: 0.5em; /* Adjust spacing from top */
/* NOTE(review): positioned top-LEFT, while copy buttons conventionally sit
   top-right (and the <pre> rule above pads the right) — confirm intended. */
left: 0.5em; /* Adjust spacing from left */
z-index: 1; /* Sit on top of code */
background-color: var(--progress-bar-background, #444); /* Use a background */
color: var(--font-color, #eaeaea);
border: 1px solid var(--secondary-color, #727578);
padding: 3px 8px;
font-size: 0.8em;
font-family: var(--font-stack, monospace);
border-radius: 4px;
cursor: pointer;
opacity: 0; /* Hidden by default */
transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
white-space: nowrap;
}
/* Show button on hover of the <pre> container */
#terminal-mkdocs-main-content pre:hover .copy-code-button {
opacity: 0.8; /* Show partially */
}
.copy-code-button:hover {
opacity: 1; /* Fully visible on button hover */
background-color: var(--secondary-color, #727578);
}
.copy-code-button:focus {
opacity: 1; /* Ensure visible when focused */
outline: 1px dashed var(--primary-color);
}
/* Style for "Copied!" state (class toggled by copy_code.js) */
.copy-code-button.copied {
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border-color: var(--primary-dimmed-color, #09b5a5);
opacity: 1; /* Ensure visible */
}
.copy-code-button.copied:hover {
background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
}
/* ==== File: docs/assets/layout.css (Additions) ==== */
/* ... (keep all existing layout CSS) ... */
/* --- Floating Ask AI Button --- */
/* Fixed bottom-right FAB injected by floating_ask_ai_button.js */
.floating-ask-ai-button {
position: fixed;
bottom: 25px;
right: 25px;
z-index: 1050; /* Below modals, above most content */
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border: none;
border-radius: 50%; /* Make it circular */
width: 60px; /* Adjust size */
height: 60px; /* Adjust size */
padding: 10px; /* Adjust padding */
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
cursor: pointer;
transition: background-color 0.2s ease, transform 0.2s ease;
display: flex;
flex-direction: column; /* Stack icon and text */
align-items: center;
justify-content: center;
text-decoration: none;
text-align: center;
}
.floating-ask-ai-button svg {
width: 24px; /* Control icon size */
height: 24px;
}
.floating-ask-ai-button span {
font-size: 0.7em;
margin-top: 2px; /* Space between icon and text */
display: block; /* Ensure it takes space */
line-height: 1;
}
.floating-ask-ai-button:hover {
background-color: var(--primary-color, #50ffff);
transform: scale(1.05); /* Slight grow effect */
}
.floating-ask-ai-button:focus {
outline: 2px solid var(--primary-color);
outline-offset: 2px;
}
/* Optional: Hide text on smaller screens if needed */
@media screen and (max-width: 768px) {
.floating-ask-ai-button span {
/* display: none; */ /* Uncomment to hide text */
}
.floating-ask-ai-button {
width: 55px;
height: 55px;
bottom: 20px;
right: 20px;
}
}

View File

@@ -0,0 +1,109 @@
// ==== File: docs/assets/selection_ask_ai.js ====
// Shows a small "Ask AI" button next to any text the reader selects; clicking
// it forwards the selection (base64-encoded in the ?qq= query param) to the
// Ask AI page.
document.addEventListener('DOMContentLoaded', () => {
let askAiButton = null; // lazily created singleton button element
const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
// Create the floating button once and wire up its click handler.
function createAskAiButton() {
const button = document.createElement('button');
button.id = 'ask-ai-selection-btn';
button.className = 'ask-ai-selection-button';
button.textContent = 'Ask AI'; // Or use an icon
button.style.display = 'none'; // Initially hidden
button.style.position = 'absolute';
button.style.zIndex = '1500'; // Ensure it's on top
document.body.appendChild(button);
button.addEventListener('click', handleAskAiClick);
return button;
}
// Return the trimmed selection text, or null when nothing useful is selected.
function getSafeSelectedText() {
const selection = window.getSelection();
if (!selection || selection.rangeCount === 0) {
return null;
}
// Avoid selecting text within the button itself if it was somehow selected
const container = selection.getRangeAt(0).commonAncestorContainer;
if (askAiButton && askAiButton.contains(container)) {
return null;
}
const text = selection.toString().trim();
return text.length > 0 ? text : null;
}
// Place the button just above and to the right of the selection rectangle.
function positionButton(event) {
const selection = window.getSelection();
if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
hideButton();
return;
}
const range = selection.getRangeAt(0);
const rect = range.getBoundingClientRect();
// Calculate position: top-right of the selection
const scrollX = window.scrollX;
const scrollY = window.scrollY;
const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
const buttonLeft = rect.right + scrollX + 5; // 5px to the right
askAiButton.style.top = `${buttonTop}px`;
askAiButton.style.left = `${buttonLeft}px`;
askAiButton.style.display = 'block'; // Show the button
}
// Hide the button if it exists.
function hideButton() {
if (askAiButton) {
askAiButton.style.display = 'none';
}
}
// Click handler: encode the selection and navigate to the Ask AI page.
function handleAskAiClick(event) {
event.stopPropagation(); // Prevent mousedown from hiding button immediately
const selectedText = getSafeSelectedText();
if (selectedText) {
console.log("Selected Text:", selectedText);
// Base64 encode for URL safety (handles special chars, line breaks)
// Use encodeURIComponent first for proper Unicode handling before btoa
// (unescape is deprecated, but this is the classic UTF-8 -> base64 shim;
// the receiving page presumably reverses it via atob — TODO confirm)
const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
console.log("Navigating to:", targetUrl);
window.location.href = targetUrl; // Navigate to Ask AI page
}
hideButton(); // Hide after click
}
// --- Event Listeners ---
// Show button on mouse up after selection
document.addEventListener('mouseup', (event) => {
// Slight delay to ensure selection is registered
setTimeout(() => {
const selectedText = getSafeSelectedText();
if (selectedText) {
if (!askAiButton) {
askAiButton = createAskAiButton();
}
// Don't position if the click was ON the button itself
if (event.target !== askAiButton) {
positionButton(event);
}
} else {
hideButton();
}
}, 10); // Small delay
});
// Hide button on scroll or click elsewhere.
// mousedown fires before the button's click event; excluding the button here
// is what lets handleAskAiClick run before the button is hidden.
document.addEventListener('mousedown', (event) => {
// Hide if clicking anywhere EXCEPT the button itself
if (askAiButton && event.target !== askAiButton) {
hideButton();
}
});
document.addEventListener('scroll', hideButton, true); // Capture scroll events
console.log("Selection Ask AI script loaded.");
});

View File

@@ -6,8 +6,8 @@
} }
:root { :root {
--global-font-size: 16px; --global-font-size: 14px;
--global-code-font-size: 16px; --global-code-font-size: 13px;
--global-line-height: 1.5em; --global-line-height: 1.5em;
--global-space: 10px; --global-space: 10px;
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
@@ -50,8 +50,17 @@
--display-h1-decoration: none; --display-h1-decoration: none;
--display-h1-decoration: none; --display-h1-decoration: none;
--header-height: 65px; /* Adjust based on your actual header height */
--sidebar-width: 280px; /* Adjust based on your desired sidebar width */
--toc-width: 240px; /* Adjust based on your desired ToC width */
--layout-transition-speed: 0.2s; /* For potential future animations */
--page-width : 100em; /* Adjust based on your design */
} }
/* body { /* body {
background-color: var(--background-color); background-color: var(--background-color);
color: var(--font-color); color: var(--font-color);
@@ -256,4 +265,6 @@ div.badges a {
} }
div.badges a > img { div.badges a > img {
width: auto; width: auto;
} }

144
docs/md_v2/assets/toc.js Normal file
View File

@@ -0,0 +1,144 @@
// ==== File: assets/toc.js ====
// Builds the right-hand "On this page" ToC from the page's h2-h4 headings,
// runs an IntersectionObserver scroll-spy to highlight the current section,
// and moves the footer (plus its preceding <hr>) into the main content column.
// Fixes: guard against pages with no <footer> (previously threw a TypeError
// on footer.previousElementSibling); removed the unused `tocLinks` local.
document.addEventListener('DOMContentLoaded', () => {
const mainContent = document.getElementById('terminal-mkdocs-main-content');
const tocContainer = document.getElementById('toc-sidebar');
const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // flex container for content + ToC
if (!mainContent) {
console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
return;
}
// --- Create ToC container if it doesn't exist ---
let tocElement = tocContainer;
if (!tocElement) {
if (!mainGrid) {
console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
return;
}
tocElement = document.createElement('aside');
tocElement.id = 'toc-sidebar';
tocElement.style.display = 'none'; // keep hidden until populated
// Append it as the last child of the flex grid
mainGrid.appendChild(tocElement);
console.info("TOC Generator: Created '#toc-sidebar' element.");
}
// --- Find Headings (h2, h3, h4 feed the ToC) ---
const headings = mainContent.querySelectorAll('h2, h3, h4');
if (headings.length === 0) {
console.info("TOC Generator: No headings found on this page. ToC not generated.");
tocElement.style.display = 'none'; // Ensure it's hidden
return;
}
// --- Generate ToC List ---
const tocList = document.createElement('ul');
const observerTargets = []; // headings watched by the scroll-spy observer
headings.forEach((heading, index) => {
// Headings need an id to be linkable; synthesize a slug-like one when missing.
if (!heading.id) {
heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
}
const listItem = document.createElement('li');
const link = document.createElement('a');
link.href = `#${heading.id}`;
link.textContent = heading.textContent;
// toc-level-N class drives per-level indentation in the CSS
const level = parseInt(heading.tagName.substring(1), 10); // 2, 3, or 4
listItem.classList.add(`toc-level-${level}`);
listItem.appendChild(link);
tocList.appendChild(listItem);
observerTargets.push(heading);
});
// --- Populate and Show ToC ---
const tocTitle = document.createElement('h4');
tocTitle.textContent = 'On this page'; // Customize title if needed
tocElement.innerHTML = ''; // Clear previous content if any
tocElement.appendChild(tocTitle);
tocElement.appendChild(tocList);
tocElement.style.display = ''; // Show the ToC container
console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
// --- Scroll Spy using Intersection Observer ---
let activeLink = null; // currently highlighted ToC link
const observerOptions = {
// Shift the observation box down by the header height and pull its bottom
// up 60% so a heading activates while its section occupies the viewport.
rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
threshold: 0 // Trigger as soon as any part enters/exits the boundary
};
const observerCallback = (entries) => {
// Among the headings currently inside the margin box, pick the topmost.
let topmostVisibleHeading = null;
entries.forEach(entry => {
const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
if (!link) return; // heading without a matching ToC entry
if (entry.isIntersecting) {
if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
topmostVisibleHeading = entry.target;
}
}
});
if (topmostVisibleHeading) {
const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
if (newActiveLink && newActiveLink !== activeLink) {
if (activeLink) {
activeLink.classList.remove('active');
activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
}
newActiveLink.classList.add('active');
newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
activeLink = newActiveLink;
// Optional: newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
}
// When nothing intersects (e.g. scrolled past the last section) the
// previously active link deliberately stays highlighted.
};
const observer = new IntersectionObserver(observerCallback, observerOptions);
observerTargets.forEach(heading => observer.observe(heading));
// Initial highlight once layout has settled.
setTimeout(() => {
observerCallback(observer.takeRecords()); // Process initial state
}, 100);
// --- Move footer (and the hr before it) to the end of the main content ---
// Guarded: querySelector returns null on pages without a footer.
const footer = document.querySelector('footer');
if (footer) {
const hr = footer.previousElementSibling;
if (hr && hr.tagName === 'HR') {
mainContent.appendChild(hr);
}
mainContent.appendChild(footer);
console.info("TOC Generator: Footer moved to the end of the main content.");
}
});

View File

@@ -251,7 +251,7 @@ from crawl4ai import (
RoundRobinProxyStrategy, RoundRobinProxyStrategy,
) )
import asyncio import asyncio
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
async def main(): async def main():
# Load proxies and create rotation strategy # Load proxies and create rotation strategy
proxies = ProxyConfig.from_env() proxies = ProxyConfig.from_env()

74
docs/md_v2/core/ask-ai.md Normal file
View File

@@ -0,0 +1,74 @@
<div class="ask-ai-container">
<iframe id="ask-ai-frame" src="../../ask_ai/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI Assistant"></iframe>
</div>
<script>
// Iframe height adjustment
// Size the Ask-AI iframe to fill the viewport below the site header,
// never shrinking below 600px.
function resizeAskAiIframe() {
    const iframe = document.getElementById('ask-ai-frame');
    if (iframe) {
        // --header-height may be unset; fall back to 55 (treated as px by parseFloat).
        const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
        // Footer is removed by JS below, so calculate height based on header + small buffer
        const topOffset = headerHeight + 20; // Header + buffer/margin
        const availableHeight = window.innerHeight - topOffset;
        iframe.style.height = Math.max(600, availableHeight) + 'px'; // Min height 600px
    }
}
// Run immediately and on resize/load
resizeAskAiIframe(); // Initial call
let resizeTimer;
window.addEventListener('load', resizeAskAiIframe);
// Debounce resize events (150 ms) so the iframe is not re-measured on every pixel of a drag.
window.addEventListener('resize', () => {
    clearTimeout(resizeTimer);
    resizeTimer = setTimeout(resizeAskAiIframe, 150);
});
// Remove Footer & HR from parent page (DOM Ready might be safer)
// Remove the parent page's footer (and the <hr> directly before it) so the
// embedded assistant can use the full viewport height.
document.addEventListener('DOMContentLoaded', () => {
    setTimeout(() => { // Add slight delay just in case elements render slowly
        const footer = window.parent.document.querySelector('footer'); // Target parent document
        if (footer) {
            const hrBeforeFooter = footer.previousElementSibling;
            if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
                hrBeforeFooter.remove();
            }
            footer.remove();
            // Trigger resize again after removing footer
            resizeAskAiIframe();
        } else {
            console.warn("Ask AI Page: Could not find footer in parent document to remove.");
        }
    }, 100); // Shorter delay
});
</script>
<style>
#terminal-mkdocs-main-content {
padding: 0 !important;
margin: 0;
width: 100%;
height: 100%;
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
}
/* Ensure iframe container takes full space */
#terminal-mkdocs-main-content .ask-ai-container {
/* Remove negative margins if footer removal handles space */
margin: 0;
padding: 0;
max-width: none;
/* Let the JS set the height */
/* height: 600px; Initial fallback height */
overflow: hidden; /* Hide potential overflow before JS resize */
}
/* Hide title/paragraph if they were part of the markdown */
/* Alternatively, just remove them from the .md file directly */
/* #terminal-mkdocs-main-content > h1,
#terminal-mkdocs-main-content > p:first-of-type {
display: none;
} */
</style>

File diff suppressed because it is too large Load Diff

View File

View File

@@ -7,10 +7,11 @@ docs_dir: docs/md_v2
nav: nav:
- Home: 'index.md' - Home: 'index.md'
- "Ask AI": "core/ask-ai.md"
- "Quick Start": "core/quickstart.md"
- Setup & Installation: - Setup & Installation:
- "Installation": "core/installation.md" - "Installation": "core/installation.md"
- "Docker Deployment": "core/docker-deployment.md" - "Docker Deployment": "core/docker-deployment.md"
- "Quick Start": "core/quickstart.md"
- "Blog & Changelog": - "Blog & Changelog":
- "Blog Home": "blog/index.md" - "Blog Home": "blog/index.md"
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md" - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
@@ -76,6 +77,7 @@ extra:
version: !ENV [CRAWL4AI_VERSION, 'development'] version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css: extra_css:
- assets/layout.css
- assets/styles.css - assets/styles.css
- assets/highlight.css - assets/highlight.css
- assets/dmvendor.css - assets/dmvendor.css
@@ -83,4 +85,9 @@ extra_css:
extra_javascript: extra_javascript:
- assets/highlight.min.js - assets/highlight.min.js
- assets/highlight_init.js - assets/highlight_init.js
- https://buttons.github.io/buttons.js - https://buttons.github.io/buttons.js
- assets/toc.js
- assets/github_stats.js
- assets/selection_ask_ai.js
- assets/copy_code.js
- assets/floating_ask_ai_button.js

View File

@@ -1,20 +0,0 @@
The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters.
Here's what needs to be updated:
1. Change section title from:
```
### G) **Debug & Logging**
```
to:
```
### G) **Debug, Logging & Capturing**
```
2. Add new parameters to the table:
```
| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. |
| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. |
```
These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig.

View File

@@ -0,0 +1,596 @@
# ==== File: test_rest_api_deep_crawl.py ====
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file if present
# --- Test Configuration ---
# Fix: BASE_URL was assigned twice from the same env var, leaving the first
# assignment (Docker default :11235) dead. Keep a single assignment; override
# the target server via CRAWL4AI_TEST_URL (e.g. http://localhost:11235 when
# the server runs in Docker; :8020 is the dev-debug default).
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"  # Start URL for deep crawls
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"  # Used for domain filter
# --- Helper Functions ---
def load_proxies_from_env() -> List[Dict]:
    """Parse proxy definitions from the PROXIES environment variable.

    PROXIES is a comma-separated list where each entry is either
    "ip:port:username:password" or "ip:port". Entries in any other shape
    are skipped with a warning. Returns a list of dicts with at least a
    "server" URL and the original "ip".
    """
    parsed: List[Dict] = []
    raw = os.getenv("PROXIES", "")
    if not raw:
        print("PROXIES environment variable not set or empty.")
        return parsed
    try:
        for entry in raw.split(","):
            entry = entry.strip()
            if not entry:
                continue  # Tolerate trailing/doubled commas.
            fields = entry.split(":")
            if len(fields) == 4:
                host, port, user, pwd = fields
                parsed.append({
                    "server": f"http://{host}:{port}",  # Assuming http, adjust if needed
                    "username": user,
                    "password": pwd,
                    "ip": host,  # Store original IP if available
                })
            elif len(fields) == 2:  # ip:port only
                host, port = fields
                parsed.append({
                    "server": f"http://{host}:{port}",
                    "ip": host,
                })
            else:
                print(f"Skipping invalid proxy string format: {entry}")
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return parsed
async def check_server_health(client: httpx.AsyncClient):
    """Check if the server is healthy before running tests.

    Hits GET /health on the configured BASE_URL; any transport error or
    non-2xx status aborts the test immediately via pytest.fail.
    """
    try:
        response = await client.get("/health")
        response.raise_for_status()
        print(f"\nServer healthy: {response.json()}")
        return True
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False keeps the failure message short instead of dumping this helper's traceback.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
    """Verify a single crawl result dict has the shape deep-crawl tests expect.

    Requires url/success/html plus a metadata dict carrying "depth"; when
    check_ssl is True, also requires an "ssl_certificate" field that is a
    dict or None.
    """
    assert isinstance(result, dict)
    for key in ("url", "success", "html", "metadata"):
        assert key in result
    metadata = result["metadata"]
    assert isinstance(metadata, dict)
    assert "depth" in metadata  # Deep crawls annotate every result with its depth.
    if check_ssl:
        assert "ssl_certificate" in result
        cert = result["ssl_certificate"]
        assert cert is None or isinstance(cert, dict)
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Processes an NDJSON streaming response.

    Collects every result object (a JSON line carrying a "url" key) until the
    server emits its {"status": "completed"} marker. Fails the test if a line
    is not valid JSON or the completion marker never arrives.
    """
    results = []
    completed = False
    async for line in response.aiter_lines():
        if line:  # Skip blank keep-alive lines.
            try:
                data = json.loads(line)
                if data.get("status") == "completed":
                    completed = True
                    break # Stop processing after completion marker
                elif data.get("url"): # Ensure it looks like a result object
                    results.append(data)
                else:
                    print(f"Received non-result JSON line: {data}") # Log other status messages if needed
            except json.JSONDecodeError:
                pytest.fail(f"Failed to decode JSON line: {line}")
    assert completed, "Streaming response did not end with a completion marker."
    return results
# --- Pytest Fixtures ---
@pytest_asyncio.fixture(scope="function")
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Provides an async HTTP client bound to BASE_URL.

    Function-scoped so each test gets a fresh client; the generous 300 s
    timeout accommodates long-running deep crawls.
    """
    # Increased timeout for potentially longer deep crawls
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
        yield client
    # No explicit close needed with 'async with'
# --- Test Class ---
@pytest.mark.asyncio
class TestDeepCrawlEndpoints:
    @pytest_asyncio.fixture(autouse=True)
    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
        """Fixture to ensure server is healthy before each test in the class.

        autouse=True runs this before every test method; check_server_health
        fails fast (pytest.fail) when the /health endpoint is unreachable.
        """
        await check_server_health(async_client)
# 1. Basic Deep Crawl
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with limited depth and pages."""
max_depth = 1
max_pages = 3 # start_url + 2 more
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS", # Use string value for CacheMode
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
# Minimal filters for basic test
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) > 1 # Should be more than just the start URL
assert len(data["results"]) <= max_pages # Respect max_pages
found_depth_0 = False
found_depth_1 = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
depth = result["metadata"]["depth"]
assert depth <= max_depth
if depth == 0: found_depth_0 = True
if depth == 1: found_depth_1 = True
assert found_depth_0
assert found_depth_1
# 2. Deep Crawl with Filtering
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with content type and domain filters."""
max_depth = 1
max_pages = 5
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
},
{
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
# Example: Exclude specific paths using regex
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*/category-3/*"], # Block category 3
"reverse": True # Block if match
}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
assert "category-3" not in result["url"] # Check if filter worked
assert result["metadata"]["depth"] <= max_depth
# 3. Deep Crawl with Scoring
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with URL scoring."""
max_depth = 1
max_pages = 4
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Keep basic domain filter
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
},
"url_scorer": { # Add scorer
"type": "CompositeScorer",
"params": {
"scorers": [
{ # Favor pages with 'product' in the URL
"type": "KeywordRelevanceScorer",
"params": {"keywords": ["product"], "weight": 1.0}
},
{ # Penalize deep paths slightly
"type": "PathDepthScorer",
"params": {"optimal_depth": 2, "weight": -0.2}
}
]
}
},
# Set a threshold if needed: "score_threshold": 0.1
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
# We expect scoring to prioritize product pages if available within limits
# assert product_urls_found # This might be too strict depending on site structure and limits
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["metadata"]["depth"] <= max_depth
# 4. Deep Crawl with CSS Extraction
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
max_depth = 6 # Go deep enough to reach product pages
max_pages = 20
# Schema to extract product details
product_schema = {
"name": "ProductDetails",
"baseSelector": "div.container", # Base for product page
"fields": [
{"name": "product_title", "selector": "h1", "type": "text"},
{"name": "price", "selector": ".product-price", "type": "text"},
{"name": "description", "selector": ".product-description p", "type": "text"},
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
]}
]
}
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply extraction to ALL crawled pages
"type": "JsonCssExtractionStrategy",
"params": {"schema": {"type": "dict", "value": product_schema}}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Only crawl HTML on our domain
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
# Optional: Add scoring to prioritize product pages for extraction
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
# assert len(data["results"]) <= max_pages
found_extracted_product = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
if "product_" in result["url"]: # Check product pages specifically
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
# Schema returns list even if one base match
assert isinstance(extracted, list)
if extracted:
item = extracted[0]
assert "product_title" in item and item["product_title"]
assert "price" in item and item["price"]
# Specs might be empty list if not found
assert "specs" in item and isinstance(item["specs"], list)
found_extracted_product = True
print(f"Extracted product: {item.get('product_title')}")
except (json.JSONDecodeError, AssertionError, IndexError) as e:
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
# else:
# # Non-product pages might have None or empty list depending on schema match
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
max_depth = 1 # Limit depth to keep LLM calls manageable
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply LLM extraction to crawled pages
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
"llm_config": { # Example override, rely on server default if possible
"type": "LLMConfig",
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
},
"schema": { # Expected JSON output
"type": "dict",
"value": {
"title": "PageContent", "type": "object",
"properties": {
"h1_title": {"type": "string"},
"first_paragraph": {"type": "string"}
}
}
}
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
except httpx.RequestError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
found_llm_extraction = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
assert isinstance(extracted, dict)
assert "h1_title" in extracted # Check keys based on schema
assert "first_paragraph" in extracted
found_llm_extraction = True
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
    # 6. Deep Crawl with SSL Certificate Fetching
    async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl with fetch_ssl_certificate enabled.

        Crawls only the start URL (depth 0, one page) and, when the server
        returns certificate data, checks the serialized certificate dict
        carries the expected keys.
        """
        max_depth = 0 # Only fetch for start URL to keep test fast
        max_pages = 1
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "fetch_ssl_certificate": True, # <-- Enable SSL fetching
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                        }
                    }
                }
            }
        }
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        data = response.json()
        assert data["success"] is True
        assert len(data["results"]) == 1
        result = data["results"][0]
        await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
        assert result["success"] is True
        # Check if SSL info was actually retrieved
        if result["ssl_certificate"]:
            # Assert directly using dictionary keys
            assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
            assert "issuer" in result["ssl_certificate"]
            assert "subject" in result["ssl_certificate"]
            # The server serializes validity under not_before/not_after keys.
            assert "not_before" in result["ssl_certificate"] # Check for the actual key
            assert "not_after" in result["ssl_certificate"] # Check for the actual key
            assert "fingerprint" in result["ssl_certificate"] # Check another key
            # .get() chains tolerate missing optional sub-keys in the cert dict.
            print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
            print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
        else:
            # A null certificate is not a failure: the server may legitimately omit it.
            print("SSL Certificate was null in the result.")
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl using proxy rotation."""
proxies = load_proxies_from_env()
if not proxies:
pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
max_depth = 1
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
# Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"proxy_rotation_strategy": { # <-- Define the strategy
"type": "RoundRobinProxyStrategy",
"params": {
# Convert ProxyConfig dicts back to the serialized format expected by server
"proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
# Proxies often cause connection errors, catch them
pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
except httpx.RequestError as e:
pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Primary assertion is that the crawl succeeded *with* proxy config
print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
# Verifying specific proxy usage requires server logs or custom headers/responses
# --- Main Execution Block (for running script directly) ---
if __name__ == "__main__":
    # Verbose output (-v) with print statements shown (-s), limited to this file.
    args = ["-v", "-s", __file__]
    # Example: Run only proxy test
    # args.append("-k test_deep_crawl_with_proxies")
    print(f"Running pytest with args: {args}")
    rc = pytest.main(args)
    print(f"Pytest finished with exit code: {rc}")

View File

@@ -0,0 +1,655 @@
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv()
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON.
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
BFSDeepCrawlStrategy,
FilterChain,
ContentTypeFilter,
DomainFilter,
CompositeScorer,
KeywordRelevanceScorer,
PathDepthScorer,
JsonCssExtractionStrategy,
LLMExtractionStrategy,
LLMConfig
)
# --- Test Configuration ---
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
# Use a known simple HTML page for basic tests
SIMPLE_HTML_URL = "https://httpbin.org/html"
# Use a site suitable for scraping tests
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
# Use a site with internal links for deep crawl tests
DEEP_CRAWL_URL = "https://python.org"
# --- Pytest Fixtures ---
# Use the built-in event_loop fixture from pytest_asyncio
# The custom implementation was causing issues with closing the loop
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Provides an async HTTP client bound to BASE_URL.

    Function scope gives each test its own client/event loop pairing; the
    client is closed explicitly after the test yields back.
    """
    client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
    yield client
    await client.aclose()
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
    """Check if the server is healthy before running tests.

    GETs /health and aborts the whole test via pytest.fail on any transport
    error or non-2xx status.
    """
    try:
        response = await client.get("/health")
        response.raise_for_status()
        print(f"\nServer healthy: {response.json()}")
        return True
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False suppresses this helper's traceback in the failure report.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any]):
    """Sanity-check that a crawl result dict carries the core fields."""
    assert isinstance(result, dict)
    for field in ("url", "success", "html"):
        assert field in result
    # Extend the tuple above if more common fields become mandatory.
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Processes an NDJSON streaming response.

    Unlike the deep-crawl variant, every JSON line before the
    {"status": "completed"} marker is collected as a result. Fails the test
    if a line is not valid JSON or the marker never arrives.
    """
    results = []
    completed = False
    async for line in response.aiter_lines():
        if line:  # Skip blank keep-alive lines.
            try:
                data = json.loads(line)
                if data.get("status") == "completed":
                    completed = True
                    break # Stop processing after completion marker
                else:
                    results.append(data)
            except json.JSONDecodeError:
                pytest.fail(f"Failed to decode JSON line: {line}")
    assert completed, "Streaming response did not end with a completion marker."
    return results
# --- Test Class ---
@pytest.mark.asyncio
class TestCrawlEndpoints:
    @pytest_asyncio.fixture(autouse=True)
    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
        """Fixture to ensure server is healthy before each test in the class.

        autouse=True makes this run before every test method without being
        requested explicitly; check_server_health aborts on failure.
        """
        await check_server_health(async_client)
# 1. Simple Requests (Primitives)
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
"""Test /crawl with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False, # Explicitly false for /crawl
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value # Use enum value
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error: {e}")
print(f"Response content: {e.response.text}")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Must be true for /crawl/stream
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# 2. Multi-URL and Dispatcher
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == len(urls)
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with multiple URLs."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == len(urls)
processed_urls = set()
for result in results:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
processed_urls.add(result["url"])
assert processed_urls == set(urls) # Ensure all URLs were processed
# 3. Class Values and Nested Classes (Markdown Generator)
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using PruningContentFilter.

    Sends a crawler config whose markdown generator wraps a
    PruningContentFilter and checks that both ``raw_markdown`` and the
    pruned ``fit_markdown`` come back, with the fit version no longer
    than the raw version.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.ENABLED.value,  # Test different cache mode
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "PruningContentFilter",
                            "params": {
                                "threshold": 0.5,  # Example param
                                "threshold_type": "relative"
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed: message previously said "deep crawl" (copy/paste from test_deep_crawl).
        print("Sending markdown pruning filter request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"]  # Pruning creates fit_markdown
    assert "Moby-Dick" in result["markdown"]["raw_markdown"]
    # Fit markdown content might be different/shorter due to pruning
    assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using BM25ContentFilter.

    Queries for "Herman Melville" with a low BM25 threshold and verifies
    the response contains both raw and fit markdown. No assertion is made
    on fit content itself since BM25 matches are environment-dependent.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "BM25ContentFilter",
                            "params": {
                                "user_query": "Herman Melville",  # Query for BM25
                                "bm25_threshold": 0.1,  # Lower threshold to increase matches
                                "language": "english"  # Valid parameters
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print(f"Payload for BM25 test: {json.dumps(payload)}")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"]  # BM25 creates fit_markdown
    # Print values for debug
    print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
    print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
    # Either fit_markdown has content (possibly including our query terms)
    # or it might be empty if no good BM25 matches were found.
    # Don't assert specific content since it can be environment-dependent.
# 4. Deep Crawling
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
    """Test /crawl with a deep crawl strategy.

    Uses a BFS deep crawl (depth 1, max 5 pages) over python.org with a
    content-type + domain filter chain and a composite URL scorer, then
    verifies the start URL and at least one discovered URL were crawled.
    """
    payload = {
        "urls": [DEEP_CRAWL_URL],  # Start URL
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": CacheMode.BYPASS.value,
                "deep_crawl_strategy": {
                    "type": "BFSDeepCrawlStrategy",
                    "params": {
                        "max_depth": 1,  # Limit depth for testing speed
                        "max_pages": 5,  # Limit pages to crawl
                        "filter_chain": {
                            "type": "FilterChain",
                            "params": {
                                "filters": [
                                    {
                                        "type": "ContentTypeFilter",
                                        "params": {"allowed_types": ["text/html"]}
                                    },
                                    {
                                        "type": "DomainFilter",
                                        "params": {"allowed_domains": ["python.org", "docs.python.org"]}  # Include important subdomains
                                    }
                                ]
                            }
                        },
                        "url_scorer": {
                            "type": "CompositeScorer",
                            "params": {
                                "scorers": [
                                    {
                                        "type": "KeywordRelevanceScorer",
                                        "params": {"keywords": ["documentation", "tutorial"]}
                                    },
                                    {
                                        "type": "PathDepthScorer",
                                        "params": {"weight": 0.5, "optimal_depth": 2}
                                    }
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print("Sending deep crawl request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert isinstance(data["results"], list)
    # Expect more than 1 result due to deep crawl (start URL + crawled links)
    assert len(data["results"]) > 1
    assert len(data["results"]) <= 6  # Start URL + max_pages=5
    start_url_found = False
    crawled_urls_found = False
    for result in data["results"]:
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        # Print URL for debugging
        print(f"Crawled URL: {result['url']}")
        # Allow URLs that contain python.org (including subdomains like docs.python.org)
        assert "python.org" in result["url"]
        if result["url"] == DEEP_CRAWL_URL:
            start_url_found = True
        else:
            crawled_urls_found = True
    assert start_url_found
    assert crawled_urls_found
# 5. Extraction without LLM (JSON/CSS)
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
    """Test /crawl with JsonCssExtractionStrategy.

    Scrapes a book-list page with a CSS-selector schema and verifies the
    extracted JSON is a non-empty list of {title, price, rating} records.
    """
    payload = {
        "urls": [SCRAPE_TARGET_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "JsonCssExtractionStrategy",
                    "params": {
                        "schema": {
                            "type": "dict",  # IMPORTANT: Wrap schema dict with type/value structure
                            "value": {
                                "name": "BookList",
                                "baseSelector": "ol.row li.col-xs-6",  # Select each book item
                                "fields": [
                                    {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
                                    {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
                                    {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed: message previously said "deep crawl" (copy/paste from test_deep_crawl).
        print("Sending JSON/CSS extraction request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be a JSON string representing a list of dicts
    try:
        extracted_data = json.loads(result["extracted_content"])
        assert isinstance(extracted_data, list)
        assert len(extracted_data) > 0  # Should find some books
        # Check structure of the first extracted item
        first_item = extracted_data[0]
        assert "title" in first_item
        assert "price" in first_item
        assert "rating" in first_item
        assert "star-rating" in first_item["rating"]  # e.g., "star-rating Three"
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
# 6. Extraction with LLM
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
    """
    Test /crawl with LLMExtractionStrategy.
    NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
    configured via .llm.env or environment variables.
    This test uses the default provider configured in the server's config.yml.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "LLMExtractionStrategy",
                    "params": {
                        "instruction": "Extract the main title and the author mentioned in the text into JSON.",
                        # LLMConfig is implicitly defined by server's config.yml and .llm.env
                        # If you needed to override provider/token PER REQUEST:
                        "llm_config": {
                            "type": "LLMConfig",
                            "params": {
                                "provider": "openai/gpt-4o",  # Example override
                                "api_token": os.getenv("OPENAI_API_KEY")  # Example override
                            }
                        },
                        "schema": {  # Optional: Provide a schema for structured output
                            "type": "dict",  # IMPORTANT: Wrap schema dict
                            "value": {
                                "title": "Book Info",
                                "type": "object",
                                "properties": {
                                    "title": {"type": "string", "description": "The main title of the work"},
                                    "author": {"type": "string", "description": "The author of the work"}
                                },
                                "required": ["title", "author"]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()  # Will raise if server returns 500 (e.g., bad API key)
        data = response.json()
    except httpx.HTTPStatusError as e:
        # Catch potential server errors (like 500 due to missing/invalid API keys)
        pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
    except httpx.RequestError as e:
        pytest.fail(f"LLM extraction request failed: {e}.")
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be JSON (because we provided a schema)
    try:
        extracted_data = json.loads(result["extracted_content"])
        print(f"\nLLM Extracted Data: {extracted_data}")  # Print for verification
        # The server may return either a single object or a list of them;
        # normalize to a single record before validating the fields.
        if isinstance(extracted_data, list):
            assert len(extracted_data) > 0
            record = extracted_data[0]  # Take first item
        else:
            record = extracted_data
        assert isinstance(record, dict)
        assert "title" in record
        assert "author" in record
        assert "Moby-Dick" in record.get("title", "")
        assert "Herman Melville" in record.get("author", "")
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
    except Exception as e:  # Catch any other unexpected error
        pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
if __name__ == "__main__":
    # Run the tests in this file directly:
    #   -v  verbose output
    #   -s  show print statements immediately (useful for debugging)
    pytest_args = ["-v", "-s", __file__]
    # Additional pytest arguments can be appended here if needed, e.g.
    # pytest_args.append("-k test_llm_extraction") to run a single test.
    print(f"Running pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)
    print(f"Pytest finished with exit code: {exit_code}")

View File

@@ -0,0 +1,335 @@
# ==== File: build_dummy_site.py ====
import os
import random
import argparse
from pathlib import Path
from urllib.parse import quote
# --- Configuration ---
NUM_CATEGORIES = 3  # Top-level (L1) category pages linked from the homepage
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
MAX_DEPTH_TARGET = 5 # Explicitly set target depth  # NOTE(review): not referenced by the generator code visible here — confirm it is used elsewhere
# --- Helper Functions ---
def generate_lorem(words=20):
    """Return *words* randomly chosen lorem-ipsum words as one capitalized, period-terminated string."""
    vocab = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
             "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
             "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
    picked = [random.choice(vocab) for _ in range(words)]
    return " ".join(picked).capitalize() + "."
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = None, head_extras: str = ""):
    """Create an HTML file with basic structure and inline CSS.

    Args:
        filepath: Destination path; missing parent directories are created.
        title: Page heading, also used in the ``<title>`` tag ("<title> - FakeShop").
        body_content: Raw HTML injected below the ``<h1>``.
        breadcrumbs: Optional list of ``{"name": ..., "link": ...}`` dicts
            rendered as a breadcrumb trail above the title; the current page
            title is appended as the final (unlinked) crumb.
        head_extras: Extra raw HTML inserted into ``<head>``.
    """
    # Default to a fresh list per call (was a mutable default argument).
    if breadcrumbs is None:
        breadcrumbs = []
    os.makedirs(filepath.parent, exist_ok=True)

    # Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
    breadcrumb_html = ""
    if breadcrumbs:
        links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
        breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"

    # Basic CSS for structure identification (kept the same)
    css = """
    <style>
        body {
            font-family: sans-serif;
            padding: 20px;
            background-color: #1e1e1e;
            color: #d1d1d1;
        }
        .container {
            max-width: 960px;
            margin: auto;
            background: #2c2c2c;
            padding: 20px;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
        }
        h1, h2 {
            color: #ccc;
        }
        a {
            color: #9bcdff;
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
        ul {
            list-style: none;
            padding-left: 0;
        }
        li {
            margin-bottom: 10px;
        }
        .category-link,
        .subcategory-link,
        .product-link,
        .details-link,
        .reviews-link {
            display: block;
            padding: 8px;
            background-color: #3a3a3a;
            border-radius: 3px;
        }
        .product-preview {
            border: 1px solid #444;
            padding: 10px;
            margin-bottom: 10px;
            border-radius: 4px;
            background-color: #2a2a2a;
        }
        .product-title {
            color: #d1d1d1;
        }
        .product-price {
            font-weight: bold;
            color: #85e085;
        }
        .product-description,
        .product-specs,
        .product-reviews {
            margin-top: 15px;
            line-height: 1.6;
        }
        .product-specs li {
            margin-bottom: 5px;
            font-size: 0.9em;
        }
        .spec-name {
            font-weight: bold;
        }
        .breadcrumbs {
            margin-bottom: 20px;
            font-size: 0.9em;
            color: #888;
        }
        .breadcrumbs a {
            color: #9bcdff;
        }
    </style>
    """
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title} - FakeShop</title>
    {head_extras}
    {css}
</head>
<body>
    <div class="container">
        {breadcrumb_html}
        <h1>{title}</h1>
        {body_content}
    </div>
</body>
</html>"""
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html_content)
    # Keep print statement concise for clarity
    # print(f"Created: {filepath}")
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
    """Generate the dummy website structure under *base_dir*.

    Pages are nested five levels deep: homepage → category → sub-category
    (product list) → product → details/reviews. In-page links are relative;
    breadcrumb links are absolute, prefixed with *base_path*.

    Args:
        base_dir: Directory that receives the generated HTML tree (created if missing).
        site_name: Shop name. NOTE(review): currently unused inside this function —
            page titles hard-code "FakeShop"; confirm whether it should be threaded through.
        base_path: URL prefix for hosting under a sub-path (e.g. "samples/deepcrawl");
            empty means the site is served from the web root.
    """
    base_dir.mkdir(parents=True, exist_ok=True)

    # --- Clean and prepare the base path for URL construction ---
    # Ensure it starts with '/' if not empty, and remove any trailing '/'
    if base_path:
        full_base_path = "/" + base_path.strip('/')
    else:
        full_base_path = ""  # Represents the root
    print(f"Using base path for links: '{full_base_path}'")

    # --- Level 0: Homepage ---
    home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
    # Define the *actual* link path for the homepage breadcrumb
    home_link_path = f"{full_base_path}/index.html"
    breadcrumbs_home = [{"name": "Home", "link": home_link_path}]  # Base breadcrumb
    # Links *within* the page content should remain relative
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        # This path is relative to the current directory (index.html)
        cat_relative_page_path = f"{cat_folder_name}/index.html"
        home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
    home_body += "</ul>"
    create_html_page(base_dir / "index.html", "Homepage", home_body, [])  # No breadcrumbs *on* the homepage itself

    # --- Levels 1-5 ---
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        cat_dir = base_dir / cat_folder_name
        # This is the *absolute* path for the breadcrumb link
        cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
        # Update breadcrumbs list for this level
        breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]

        # --- Level 1: Category Page ---
        cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            # Path relative to the category page
            subcat_relative_page_path = f"{subcat_folder_name}/index.html"
            cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
        cat_body += "</ul>"
        create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home)  # Parent breadcrumb needed here

        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            subcat_dir = cat_dir / subcat_folder_name
            # Absolute path for the breadcrumb link
            subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
            # Update breadcrumbs list for this level
            breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]

            # --- Level 2: Sub-Category Page (Product List) ---
            subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
            for k in range(NUM_PRODUCTS_PER_SUBCAT):
                prod_id = f"P{i+1}{j+1}{k+1:03d}"  # e.g., P11001
                prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
                # Filename relative to the subcategory page
                prod_filename = f"product_{prod_id}.html"
                # Absolute path for the breadcrumb link
                prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
                # Preview on list page (link remains relative).
                # BUGFIX: the price span's opening tag was missing its closing '>'
                # (was `<span class="product-price"{...}`), producing malformed HTML;
                # also prefix '£' to match the product page's price formatting.
                # NOTE(review): the preview price is an independent random draw and
                # will not match prod_price below — confirm whether that is intended.
                subcat_body += f"""
                <li>
                    <div class="product-preview">
                        <a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
                        <p>{generate_lorem(10)}</p>
                        <span class="product-price">£{random.uniform(10, 500):.2f}</span>
                    </div>
                </li>"""

                # --- Level 3: Product Page ---
                prod_price = random.uniform(10, 500)
                prod_desc = generate_lorem(40)
                prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
                prod_reviews_count = random.randint(0, 150)
                # Relative filenames for links on this page
                details_filename_relative = f"product_{prod_id}_details.html"
                reviews_filename_relative = f"product_{prod_id}_reviews.html"
                prod_body = f"""
                <p class="product-price">Price: £{prod_price:.2f}</p>
                <div class="product-description">
                    <h2>Description</h2>
                    <p>{prod_desc}</p>
                </div>
                <div class="product-specs">
                    <h2>Specifications</h2>
                    <ul>
                        {''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
                    </ul>
                </div>
                <div class="product-reviews">
                    <h2>Reviews</h2>
                    <p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
                </div>
                <hr>
                <p>
                    <a class="details-link" href="{details_filename_relative}">View More Details</a> |
                    <a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
                </p>
                """
                # Update breadcrumbs list for this level
                breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
                create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat)  # Parent breadcrumb needed here

                # --- Level 4: Product Details Page ---
                details_filename = f"product_{prod_id}_details.html"  # Actual filename
                # Absolute path for the breadcrumb link
                details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
                details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
                # Update breadcrumbs list for this level
                breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
                create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod)  # Parent breadcrumb needed here

                # --- Level 5: Product Reviews Page ---
                reviews_filename = f"product_{prod_id}_reviews.html"  # Actual filename
                # Absolute path for the breadcrumb link
                reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
                reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
                for r in range(prod_reviews_count):
                    reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
                reviews_body += "</ul>"
                # Update breadcrumbs list for this level
                breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
                create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod)  # Parent breadcrumb needed here

            subcat_body += "</ul>"  # Close product-list ul
            # Pass the correct breadcrumbs list for the subcategory index page
            create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat)  # Parent breadcrumb needed here
# --- Main Execution ---
if __name__ == "__main__":
    # Command-line entry point: configure output location, shop name and
    # optional hosting sub-path, then build the site and report the result.
    parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
    parser.add_argument(
        "-o", "--output-dir",
        type=str,
        default="dummy_retail_site",
        help="Directory to generate the website in."
    )
    parser.add_argument(
        "-n", "--site-name",
        type=str,
        default="FakeShop",
        help="Name of the fake shop."
    )
    parser.add_argument(
        "-b", "--base-path",
        type=str,
        default="",
        help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
    )
    cli = parser.parse_args()

    out_dir = Path(cli.output_dir)
    print(f"Generating dummy site '{cli.site_name}' in '{out_dir}'...")
    # Pass the base_path to the generation function
    generate_site(out_dir, cli.site_name, cli.base_path)

    page_count = sum(1 for _ in out_dir.rglob('*.html'))
    print(f"\nCreated {page_count} HTML pages.")
    print("Dummy site generation complete.")
    print(f"To serve locally (example): python -m http.server --directory {out_dir} 8000")
    if cli.base_path:
        print(f"Access the site at: http://localhost:8000/{cli.base_path.strip('/')}/index.html")
    else:
        print("Access the site at: http://localhost:8000/index.html")