diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6ef49dd3..fea79456 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,30 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+### [Feature] 2025-04-21
+- Implemented the Model Context Protocol (MCP) for machine-to-machine communication
+  - Added WebSocket and SSE transport for MCP server
+  - Exposed server endpoints via MCP
+  - Created tests for MCP socket and SSE communication
+- Enhanced Docker server with file handling and intelligent search
+  - Added PDF and screenshot endpoints with file saving capability
+  - Added JavaScript execution endpoint for page interaction
+  - Implemented advanced context search with BM25 and code chunking
+  - Added file path output support for generated assets
+- Improved server endpoints and API surface
+  - Added intelligent context search with query filtering
+  - Added syntax-aware code function chunking
+  - Implemented efficient HTML processing pipeline
+
+### [Refactor] 2025-04-20
+- Replaced crawler_manager.py with simpler crawler_pool.py implementation
+- Added global page semaphore for hard concurrency cap
+- Implemented browser pool with idle cleanup
+- Added playground UI for testing and stress testing
+- Updated API handlers to use pooled crawlers
+- Enhanced logging levels and symbols
+- Added memory tests and stress test utilities
+
 ### [Added] 2025-04-17
 - Added content source selection feature for markdown generation
   - New `content_source` parameter allows choosing between `cleaned_html`, `raw_html`, and `fit_html`
diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md
new file mode 100644
index 00000000..f2551c01
--- /dev/null
+++ b/deploy/docker/c4ai-code-context.md
@@ -0,0 +1,11631 @@
+# Crawl4AI Code Context
+
+Generated on 2025-04-21
+
+## File: crawl4ai/async_configs.py
+
+```py
+import os
+from .config import (
+    DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
+    MIN_WORD_THRESHOLD,
+    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    PROVIDER_MODELS,
+    PROVIDER_MODELS_PREFIXES,
+    SCREENSHOT_HEIGHT_TRESHOLD,
+    PAGE_TIMEOUT,
+    IMAGE_SCORE_THRESHOLD,
+    SOCIAL_MEDIA_DOMAINS,
+)
+
+from .user_agent_generator import UAGen, ValidUAGenerator  # , OnlineUAGenerator
+from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
+
+from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
+from .deep_crawling import DeepCrawlStrategy
+
+from .cache_context import CacheMode
+from .proxy_strategy import ProxyRotationStrategy
+
+from typing import Union, List
+import inspect
+from typing import Any, Dict, Optional
+from enum import Enum
+
+# from .proxy_strategy import ProxyConfig
+
+
+
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
+    """
+    Recursively convert an object to a serializable dictionary using {type, params} structure
+    for complex objects.
+    """
+    if obj is None:
+        return None
+
+    # Handle basic types
+    if isinstance(obj, (str, int, float, bool)):
+        return obj
+
+    # Handle Enum
+    if isinstance(obj, Enum):
+        return {"type": obj.__class__.__name__, "params": obj.value}
+
+    # Handle datetime objects
+    if hasattr(obj, "isoformat"):
+        return obj.isoformat()
+
+    # Handle lists, tuples, and sets, and basically any iterable
+    if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__') and not isinstance(obj, dict):
+        return [to_serializable_dict(item) for item in obj]
+
+    # Handle frozensets, which are not iterable
+    if isinstance(obj, frozenset):
+        return [to_serializable_dict(item) for item in list(obj)]
+
+    # Handle dictionaries - preserve them as-is
+    if isinstance(obj, dict):
+        return {
+            "type": "dict",  # Mark as plain dictionary
+            "value": {str(k): to_serializable_dict(v) for k, v in obj.items()},
+        }
+
+    _type = obj.__class__.__name__
+
+    # Handle class instances
+    if hasattr(obj, "__class__"):
+        # Get constructor signature
+        sig = inspect.signature(obj.__class__.__init__)
+        params = sig.parameters
+
+        # Get current values
+        current_values = {}
+        for name, param in params.items():
+            if name == "self":
+                continue
+
+            value = getattr(obj, name, param.default)
+
+            # Only include if different from default, considering empty values
+            if not (is_empty_value(value) and is_empty_value(param.default)):
+                if value != param.default and not ignore_default_value:
+                    current_values[name] = to_serializable_dict(value)
+        
+        if hasattr(obj, '__slots__'):
+            for slot in obj.__slots__:
+                if slot.startswith('_'):  # Handle private slots
+                    attr_name = slot[1:]  # Remove leading '_'
+                    value = getattr(obj, slot, None)
+                    if value is not None:
+                        current_values[attr_name] = to_serializable_dict(value)
+
+            
+        
+        return {
+            "type": obj.__class__.__name__,
+            "params": current_values
+        }
+        
+    return str(obj)
+
+
+def from_serializable_dict(data: Any) -> Any:
+    """
+    Recursively convert a serializable dictionary back to an object instance.
+    """
+    if data is None:
+        return None
+
+    # Handle basic types
+    if isinstance(data, (str, int, float, bool)):
+        return data
+
+    # Handle typed data
+    if isinstance(data, dict) and "type" in data:
+        # Handle plain dictionaries
+        if data["type"] == "dict" and "value" in data:
+            return {k: from_serializable_dict(v) for k, v in data["value"].items()}
+
+        # Import from crawl4ai for class instances
+        import crawl4ai
+
+        if hasattr(crawl4ai, data["type"]):
+            cls = getattr(crawl4ai, data["type"])
+
+            # Handle Enum
+            if issubclass(cls, Enum):
+                return cls(data["params"])
+
+            if "params" in data:
+                # Handle class instances
+                constructor_args = {
+                    k: from_serializable_dict(v) for k, v in data["params"].items()
+                }
+                return cls(**constructor_args)
+
+    # Handle lists
+    if isinstance(data, list):
+        return [from_serializable_dict(item) for item in data]
+
+    # Handle raw dictionaries (legacy support)
+    if isinstance(data, dict):
+        return {k: from_serializable_dict(v) for k, v in data.items()}
+
+    return data
+
+
+def is_empty_value(value: Any) -> bool:
+    """Check if a value is effectively empty/null."""
+    if value is None:
+        return True
+    if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
+        return True
+    return False
+
+class ProxyConfig:
+    def __init__(
+        self,
+        server: str,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        ip: Optional[str] = None,
+    ):
+        """Configuration class for a single proxy.
+        
+        Args:
+            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
+            username: Optional username for proxy authentication
+            password: Optional password for proxy authentication
+            ip: Optional IP address for verification purposes
+        """
+        self.server = server
+        self.username = username
+        self.password = password
+        
+        # Extract IP from server if not explicitly provided
+        self.ip = ip or self._extract_ip_from_server()
+    
+    def _extract_ip_from_server(self) -> Optional[str]:
+        """Extract IP address from server URL."""
+        try:
+            # Simple extraction assuming http://ip:port format
+            if "://" in self.server:
+                parts = self.server.split("://")[1].split(":")
+                return parts[0]
+            else:
+                parts = self.server.split(":")
+                return parts[0]
+        except Exception:
+            return None
+    
+    @staticmethod
+    def from_string(proxy_str: str) -> "ProxyConfig":
+        """Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
+        parts = proxy_str.split(":")
+        if len(parts) == 4:  # ip:port:username:password
+            ip, port, username, password = parts
+            return ProxyConfig(
+                server=f"http://{ip}:{port}",
+                username=username,
+                password=password,
+                ip=ip
+            )
+        elif len(parts) == 2:  # ip:port only
+            ip, port = parts
+            return ProxyConfig(
+                server=f"http://{ip}:{port}",
+                ip=ip
+            )
+        else:
+            raise ValueError(f"Invalid proxy string format: {proxy_str}")
+    
+    @staticmethod
+    def from_dict(proxy_dict: Dict) -> "ProxyConfig":
+        """Create a ProxyConfig from a dictionary."""
+        return ProxyConfig(
+            server=proxy_dict.get("server"),
+            username=proxy_dict.get("username"),
+            password=proxy_dict.get("password"),
+            ip=proxy_dict.get("ip")
+        )
+    
+    @staticmethod
+    def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
+        """Load proxies from environment variable.
+        
+        Args:
+            env_var: Name of environment variable containing comma-separated proxy strings
+            
+        Returns:
+            List of ProxyConfig objects
+        """
+        proxies = []
+        try:
+            proxy_list = os.getenv(env_var, "").split(",")
+            for proxy in proxy_list:
+                if not proxy:
+                    continue
+                proxies.append(ProxyConfig.from_string(proxy))
+        except Exception as e:
+            print(f"Error loading proxies from environment: {e}")
+        return proxies
+    
+    def to_dict(self) -> Dict:
+        """Convert to dictionary representation."""
+        return {
+            "server": self.server,
+            "username": self.username,
+            "password": self.password,
+            "ip": self.ip
+        }
+    
+    def clone(self, **kwargs) -> "ProxyConfig":
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            ProxyConfig: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return ProxyConfig.from_dict(config_dict)
+
+
+
+class BrowserConfig:
+    """
+    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
+
+    This class centralizes all parameters that affect browser and context creation. Instead of passing
+    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
+    code will then reference these settings to initialize the browser in a consistent, documented manner.
+
+    Attributes:
+        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                            Default: "chromium".
+        headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                         Default: True.
+        browser_mode (str): Determines how the browser should be initialized:
+                           "builtin" - use the builtin CDP browser running in background
+                           "dedicated" - create a new dedicated browser instance each time
+                           "cdp" - use explicit CDP settings provided in cdp_url
+                           "docker" - run browser in Docker container with isolation
+                           Default: "dedicated"
+        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
+                                    advanced manipulation. Default: False.
+        cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
+        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
+        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
+                                       Automatically sets use_managed_browser=True. Default: False.
+        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
+                                     temporary directory may be used. Default: None.
+        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
+                              is "chromium". Default: "chromium".
+        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
+                              is "chromium". Default: "chromium".
+        proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
+                             Default: None.
+        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
+                                     If None, no additional proxy config. Default: None.
+        viewport_width (int): Default viewport width for pages. Default: 1080.
+        viewport_height (int): Default viewport height for pages. Default: 600.
+        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
+                         Default: None.
+        verbose (bool): Enable verbose logging.
+                        Default: True.
+        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
+                                 Default: False.
+        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
+                                      a default path will be created. Default: None.
+        storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
+                                             Default: None.
+        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
+        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
+        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
+                        {"name": "...", "value": "...", "url": "..."}.
+                        Default: [].
+        headers (dict): Extra HTTP headers to apply to all requests in this context.
+                        Default: {}.
+        user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
+        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
+                                       user_agent as-is. Default: None.
+        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
+                                                    Default: None.
+        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
+                          Default: False.
+        light_mode (bool): Disables certain background features for performance gains. Default: False.
+        extra_args (list): Additional command-line arguments passed to the browser.
+                           Default: [].
+    """
+
+    def __init__(
+        self,
+        browser_type: str = "chromium",
+        headless: bool = True,
+        browser_mode: str = "dedicated",
+        use_managed_browser: bool = False,
+        cdp_url: str = None,
+        use_persistent_context: bool = False,
+        user_data_dir: str = None,
+        chrome_channel: str = "chromium",
+        channel: str = "chromium",
+        proxy: str = None,
+        proxy_config: Union[ProxyConfig, dict, None] = None,
+        viewport_width: int = 1080,
+        viewport_height: int = 600,
+        viewport: dict = None,
+        accept_downloads: bool = False,
+        downloads_path: str = None,
+        storage_state: Union[str, dict, None] = None,
+        ignore_https_errors: bool = True,
+        java_script_enabled: bool = True,
+        sleep_on_close: bool = False,
+        verbose: bool = True,
+        cookies: list = None,
+        headers: dict = None,
+        user_agent: str = (
+            # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
+            # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+            # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
+        ),
+        user_agent_mode: str = "",
+        user_agent_generator_config: dict = {},
+        text_mode: bool = False,
+        light_mode: bool = False,
+        extra_args: list = None,
+        debugging_port: int = 9222,
+        host: str = "localhost",
+    ):
+        self.browser_type = browser_type
+        self.headless = headless or True
+        self.browser_mode = browser_mode
+        self.use_managed_browser = use_managed_browser
+        self.cdp_url = cdp_url
+        self.use_persistent_context = use_persistent_context
+        self.user_data_dir = user_data_dir
+        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
+        self.channel = channel or self.browser_type or "chromium"
+        if self.browser_type in ["firefox", "webkit"]:
+            self.channel = ""
+            self.chrome_channel = ""
+        self.proxy = proxy
+        self.proxy_config = proxy_config
+
+
+        self.viewport_width = viewport_width
+        self.viewport_height = viewport_height
+        self.viewport = viewport
+        if self.viewport is not None:
+            self.viewport_width = self.viewport.get("width", 1080)
+            self.viewport_height = self.viewport.get("height", 600)
+        self.accept_downloads = accept_downloads
+        self.downloads_path = downloads_path
+        self.storage_state = storage_state
+        self.ignore_https_errors = ignore_https_errors
+        self.java_script_enabled = java_script_enabled
+        self.cookies = cookies if cookies is not None else []
+        self.headers = headers if headers is not None else {}
+        self.user_agent = user_agent
+        self.user_agent_mode = user_agent_mode
+        self.user_agent_generator_config = user_agent_generator_config
+        self.text_mode = text_mode
+        self.light_mode = light_mode
+        self.extra_args = extra_args if extra_args is not None else []
+        self.sleep_on_close = sleep_on_close
+        self.verbose = verbose
+        self.debugging_port = debugging_port
+        self.host = host
+
+        fa_user_agenr_generator = ValidUAGenerator()
+        if self.user_agent_mode == "random":
+            self.user_agent = fa_user_agenr_generator.generate(
+                **(self.user_agent_generator_config or {})
+            )
+        else:
+            pass
+
+        self.browser_hint = UAGen.generate_client_hints(self.user_agent)
+        self.headers.setdefault("sec-ch-ua", self.browser_hint)
+
+        # Set appropriate browser management flags based on browser_mode
+        if self.browser_mode == "builtin":
+            # Builtin mode uses managed browser connecting to builtin CDP endpoint
+            self.use_managed_browser = True
+            # cdp_url will be set later by browser_manager
+        elif self.browser_mode == "docker":
+            # Docker mode uses managed browser with CDP to connect to browser in container
+            self.use_managed_browser = True
+            # cdp_url will be set later by docker browser strategy
+        elif self.browser_mode == "custom" and self.cdp_url:
+            # Custom mode with explicit CDP URL
+            self.use_managed_browser = True
+        elif self.browser_mode == "dedicated":
+            # Dedicated mode uses a new browser instance each time
+            pass
+
+        # If persistent context is requested, ensure managed browser is enabled
+        if self.use_persistent_context:
+            self.use_managed_browser = True
+
+    @staticmethod
+    def from_kwargs(kwargs: dict) -> "BrowserConfig":
+        return BrowserConfig(
+            browser_type=kwargs.get("browser_type", "chromium"),
+            headless=kwargs.get("headless", True),
+            browser_mode=kwargs.get("browser_mode", "dedicated"),
+            use_managed_browser=kwargs.get("use_managed_browser", False),
+            cdp_url=kwargs.get("cdp_url"),
+            use_persistent_context=kwargs.get("use_persistent_context", False),
+            user_data_dir=kwargs.get("user_data_dir"),
+            chrome_channel=kwargs.get("chrome_channel", "chromium"),
+            channel=kwargs.get("channel", "chromium"),
+            proxy=kwargs.get("proxy"),
+            proxy_config=kwargs.get("proxy_config", None),
+            viewport_width=kwargs.get("viewport_width", 1080),
+            viewport_height=kwargs.get("viewport_height", 600),
+            accept_downloads=kwargs.get("accept_downloads", False),
+            downloads_path=kwargs.get("downloads_path"),
+            storage_state=kwargs.get("storage_state"),
+            ignore_https_errors=kwargs.get("ignore_https_errors", True),
+            java_script_enabled=kwargs.get("java_script_enabled", True),
+            cookies=kwargs.get("cookies", []),
+            headers=kwargs.get("headers", {}),
+            user_agent=kwargs.get(
+                "user_agent",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
+            ),
+            user_agent_mode=kwargs.get("user_agent_mode"),
+            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
+            text_mode=kwargs.get("text_mode", False),
+            light_mode=kwargs.get("light_mode", False),
+            extra_args=kwargs.get("extra_args", []),
+            debugging_port=kwargs.get("debugging_port", 9222),
+            host=kwargs.get("host", "localhost"),
+        )
+
+    def to_dict(self):
+        result = {
+            "browser_type": self.browser_type,
+            "headless": self.headless,
+            "browser_mode": self.browser_mode,
+            "use_managed_browser": self.use_managed_browser,
+            "cdp_url": self.cdp_url,
+            "use_persistent_context": self.use_persistent_context,
+            "user_data_dir": self.user_data_dir,
+            "chrome_channel": self.chrome_channel,
+            "channel": self.channel,
+            "proxy": self.proxy,
+            "proxy_config": self.proxy_config,
+            "viewport_width": self.viewport_width,
+            "viewport_height": self.viewport_height,
+            "accept_downloads": self.accept_downloads,
+            "downloads_path": self.downloads_path,
+            "storage_state": self.storage_state,
+            "ignore_https_errors": self.ignore_https_errors,
+            "java_script_enabled": self.java_script_enabled,
+            "cookies": self.cookies,
+            "headers": self.headers,
+            "user_agent": self.user_agent,
+            "user_agent_mode": self.user_agent_mode,
+            "user_agent_generator_config": self.user_agent_generator_config,
+            "text_mode": self.text_mode,
+            "light_mode": self.light_mode,
+            "extra_args": self.extra_args,
+            "sleep_on_close": self.sleep_on_close,
+            "verbose": self.verbose,
+            "debugging_port": self.debugging_port,
+            "host": self.host,
+        }
+
+                
+        return result
+
+    def clone(self, **kwargs):
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            BrowserConfig: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return BrowserConfig.from_kwargs(config_dict)
+
+    # Create a funciton returns dict of the object
+    def dump(self) -> dict:
+        # Serialize the object to a dictionary
+        return to_serializable_dict(self)
+
+    @staticmethod
+    def load(data: dict) -> "BrowserConfig":
+        # Deserialize the object from a dictionary
+        config = from_serializable_dict(data)
+        if isinstance(config, BrowserConfig):
+            return config
+        return BrowserConfig.from_kwargs(config)
+
+
+class HTTPCrawlerConfig:
+    """HTTP-specific crawler configuration"""
+
+    method: str = "GET"
+    headers: Optional[Dict[str, str]] = None
+    data: Optional[Dict[str, Any]] = None
+    json: Optional[Dict[str, Any]] = None
+    follow_redirects: bool = True
+    verify_ssl: bool = True
+
+    def __init__(
+        self,
+        method: str = "GET",
+        headers: Optional[Dict[str, str]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        json: Optional[Dict[str, Any]] = None,
+        follow_redirects: bool = True,
+        verify_ssl: bool = True,
+    ):
+        self.method = method
+        self.headers = headers
+        self.data = data
+        self.json = json
+        self.follow_redirects = follow_redirects
+        self.verify_ssl = verify_ssl
+
+    @staticmethod
+    def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
+        return HTTPCrawlerConfig(
+            method=kwargs.get("method", "GET"),
+            headers=kwargs.get("headers"),
+            data=kwargs.get("data"),
+            json=kwargs.get("json"),
+            follow_redirects=kwargs.get("follow_redirects", True),
+            verify_ssl=kwargs.get("verify_ssl", True),
+        )
+
+    def to_dict(self):
+        return {
+            "method": self.method,
+            "headers": self.headers,
+            "data": self.data,
+            "json": self.json,
+            "follow_redirects": self.follow_redirects,
+            "verify_ssl": self.verify_ssl,
+        }
+
+    def clone(self, **kwargs):
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            HTTPCrawlerConfig: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return HTTPCrawlerConfig.from_kwargs(config_dict)
+
+    def dump(self) -> dict:
+        return to_serializable_dict(self)
+
+    @staticmethod
+    def load(data: dict) -> "HTTPCrawlerConfig":
+        config = from_serializable_dict(data)
+        if isinstance(config, HTTPCrawlerConfig):
+            return config
+        return HTTPCrawlerConfig.from_kwargs(config)
+
+class CrawlerRunConfig():
+    _UNWANTED_PROPS = {
+        'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
+        'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
+        'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
+        'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
+    }
+
+    """
+    Configuration class for controlling how the crawler runs each crawl operation.
+    This includes parameters for content extraction, page manipulation, waiting conditions,
+    caching, and other runtime behaviors.
+
+    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
+    By using this class, you have a single place to understand and adjust the crawling options.
+
+    Attributes:
+        # Deep Crawl Parameters
+        deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling.
+
+        # Content Processing Parameters
+        word_count_threshold (int): Minimum word count threshold before processing content.
+                                    Default: MIN_WORD_THRESHOLD (typically 200).
+        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
+                                                          Default: None (NoExtractionStrategy is used if None).
+        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
+                                              Default: RegexChunking().
+        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
+                                                         Default: None.
+        only_text (bool): If True, attempt to extract text-only content where applicable.
+                          Default: False.
+        css_selector (str or None): CSS selector to extract a specific portion of the page.
+                                    Default: None.
+        
+        target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation 
+                                                and structured data extraction. When you set this, only the contents 
+                                                of these elements are processed for extraction and Markdown generation. 
+                                                If you do not set any value, the entire page is processed. 
+                                                The difference between this and css_selector is that css_selector will shrink 
+                                                the initial raw HTML to the selected element, while target_elements will only affect 
+                                                the extraction and Markdown generation.
+                                    Default: None
+        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
+                                             Default: None.
+        excluded_selector (str or None): CSS selector to exclude from processing.
+                                         Default: None.
+        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
+                                     Default: False.
+        keep_attrs (list of str): List of HTML attributes to keep during processing.
+                                      Default: [].
+        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
+                             Default: False.
+        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
+                          Default: False.
+        parser_type (str): Type of parser to use for HTML parsing.
+                           Default: "lxml".
+        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
+                           Default: WebScrapingStrategy.
+        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
+                                     If None, no additional proxy config. Default: None.
+
+        # SSL Parameters
+        fetch_ssl_certificate (bool): Default: False.
+        # Caching Parameters
+        cache_mode (CacheMode or None): Defines how caching is handled.
+                                        If None, defaults to CacheMode.ENABLED internally.
+                                        Default: CacheMode.BYPASS.
+        session_id (str or None): Optional session ID to persist the browser context and the created
+                                  page instance. If the ID already exists, the crawler does not
+                                  create a new page and uses the current page to preserve the state.
+        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
+                             Default: False.
+        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
+                              Default: False.
+        no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
+                              Default: False.
+        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
+                               Default: False.
+        shared_data (dict or None): Shared data to be passed between hooks.
+                                     Default: None.
+
+        # Page Navigation and Timing Parameters
+        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
+                          Default: "domcontentloaded".
+        page_timeout (int): Timeout in ms for page operations like navigation.
+                            Default: 60000 (60 seconds).
+        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
+                                Default: None.
+        wait_for_images (bool): If True, wait for images to load before extracting content.
+                                Default: False.
+        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
+                                          Default: 0.1.
+        mean_delay (float): Mean base delay between requests when calling arun_many.
+                            Default: 0.1.
+        max_range (float): Max random additional delay range for requests in arun_many.
+                           Default: 0.3.
+        semaphore_count (int): Number of concurrent operations allowed.
+                               Default: 5.
+
+        # Page Interaction Parameters
+        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
+                                              Default: None.
+        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
+                        Default: False.
+        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
+                                       Default: True.
+        scan_full_page (bool): If True, scroll through the entire page to load all content.
+                               Default: False.
+        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
+                              Default: 0.2.
+        process_iframes (bool): If True, attempts to process and inline iframe content.
+                                Default: False.
+        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
+                                        Default: False.
+        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
+                              Default: False.
+        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
+                                   Default: False.
+        magic (bool): If True, attempts automatic handling of overlays/popups.
+                      Default: False.
+        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
+                                           Default: False.
+
+        # Media Handling Parameters
+        screenshot (bool): Whether to take a screenshot after crawling.
+                           Default: False.
+        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
+                                             Default: None.
+        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
+                                           Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
+        pdf (bool): Whether to generate a PDF of the page.
+                    Default: False.
+        image_description_min_word_threshold (int): Minimum words for image description extraction.
+                                                    Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
+        image_score_threshold (int): Minimum score threshold for processing an image.
+                                     Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
+        exclude_external_images (bool): If True, exclude all external images from processing.
+                                         Default: False.
+        table_score_threshold (int): Minimum score threshold for processing a table.
+                                     Default: 7.
+
+        # Link and Domain Handling Parameters
+        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
+                                                    Default: SOCIAL_MEDIA_DOMAINS (from config).
+        exclude_external_links (bool): If True, exclude all external links from the results.
+                                       Default: False.
+        exclude_internal_links (bool): If True, exclude internal links from the results.
+                                       Default: False.
+        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
+                                           Default: False.
+        exclude_domains (list of str): List of specific domains to exclude from results.
+                                       Default: [].
+        exclude_internal_links (bool): If True, exclude internal links from the results.
+                                       Default: False.
+
+        # Debugging and Logging Parameters
+        verbose (bool): Enable verbose logging.
+                        Default: True.
+        log_console (bool): If True, log console messages from the page.
+                            Default: False.
+
+        # HTTP Crawler Strategy Parameters
+        method (str): HTTP method to use for the request, when using AsyncHTTPCrawlerStrategy.
+                        Default: "GET".
+        data (dict): Data to send in the request body, when using AsyncHTTPCrawlerStrategy.
+                        Default: None.
+        json (dict): JSON data to send in the request body, when using AsyncHTTPCrawlerStrategy.
+
+        # Connection Parameters
+        stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
+                      Default: False.
+
+        check_robots_txt (bool): Whether to check robots.txt rules before crawling.
+                                 Default: False.
+        user_agent (str): Custom User-Agent string to use.
+                          Default: None.
+        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
+                                       Default: None.
+        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
+                                                    Default: None.
+
+        # Experimental Parameters
+        experimental (dict): Dictionary containing experimental parameters that are in beta phase.
+                            This allows passing temporary features that are not yet fully integrated 
+                            into the main parameter set.
+                            Default: None.
+
+        url: str = None  # This is not a compulsory parameter
+    """
+
+    def __init__(
+        self,
+        # Content Processing Parameters
+        word_count_threshold: int = MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
+        only_text: bool = False,
+        css_selector: str = None,
+        target_elements: List[str] = None,
+        excluded_tags: list = None,
+        excluded_selector: str = None,
+        keep_data_attributes: bool = False,
+        keep_attrs: list = None,
+        remove_forms: bool = False,
+        prettiify: bool = False,
+        parser_type: str = "lxml",
+        scraping_strategy: ContentScrapingStrategy = None,
+        proxy_config: Union[ProxyConfig, dict, None] = None,
+        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
+        # SSL Parameters
+        fetch_ssl_certificate: bool = False,
+        # Caching Parameters
+        cache_mode: CacheMode = CacheMode.BYPASS,
+        session_id: str = None,
+        bypass_cache: bool = False,
+        disable_cache: bool = False,
+        no_cache_read: bool = False,
+        no_cache_write: bool = False,
+        shared_data: dict = None,
+        # Page Navigation and Timing Parameters
+        wait_until: str = "domcontentloaded",
+        page_timeout: int = PAGE_TIMEOUT,
+        wait_for: str = None,
+        wait_for_images: bool = False,
+        delay_before_return_html: float = 0.1,
+        mean_delay: float = 0.1,
+        max_range: float = 0.3,
+        semaphore_count: int = 5,
+        # Page Interaction Parameters
+        js_code: Union[str, List[str]] = None,
+        js_only: bool = False,
+        ignore_body_visibility: bool = True,
+        scan_full_page: bool = False,
+        scroll_delay: float = 0.2,
+        process_iframes: bool = False,
+        remove_overlay_elements: bool = False,
+        simulate_user: bool = False,
+        override_navigator: bool = False,
+        magic: bool = False,
+        adjust_viewport_to_content: bool = False,
+        # Media Handling Parameters
+        screenshot: bool = False,
+        screenshot_wait_for: float = None,
+        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
+        pdf: bool = False,
+        capture_mhtml: bool = False,
+        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
+        table_score_threshold: int = 7,
+        exclude_external_images: bool = False,
+        exclude_all_images: bool = False,
+        # Link and Domain Handling Parameters
+        exclude_social_media_domains: list = None,
+        exclude_external_links: bool = False,
+        exclude_social_media_links: bool = False,
+        exclude_domains: list = None,
+        exclude_internal_links: bool = False,
+        # Debugging and Logging Parameters
+        verbose: bool = True,
+        log_console: bool = False,
+        # Network and Console Capturing Parameters
+        capture_network_requests: bool = False,
+        capture_console_messages: bool = False,
+        # Connection Parameters
+        method: str = "GET",
+        stream: bool = False,
+        url: str = None,
+        check_robots_txt: bool = False,
+        user_agent: str = None,
+        user_agent_mode: str = None,
+        user_agent_generator_config: dict = {},
+        # Deep Crawl Parameters
+        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
+        # Experimental Parameters
+        experimental: Dict[str, Any] = None,
+    ):
+        # TODO: Planning to set properties dynamically based on the __init__ signature
+        self.url = url
+
+        # Content Processing Parameters
+        self.word_count_threshold = word_count_threshold
+        self.extraction_strategy = extraction_strategy
+        self.chunking_strategy = chunking_strategy
+        self.markdown_generator = markdown_generator
+        self.only_text = only_text
+        self.css_selector = css_selector
+        self.target_elements = target_elements or []
+        self.excluded_tags = excluded_tags or []
+        self.excluded_selector = excluded_selector or ""
+        self.keep_data_attributes = keep_data_attributes
+        self.keep_attrs = keep_attrs or []
+        self.remove_forms = remove_forms
+        self.prettiify = prettiify
+        self.parser_type = parser_type
+        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
+        self.proxy_config = proxy_config
+        self.proxy_rotation_strategy = proxy_rotation_strategy
+
+        # SSL Parameters
+        self.fetch_ssl_certificate = fetch_ssl_certificate
+
+        # Caching Parameters
+        self.cache_mode = cache_mode
+        self.session_id = session_id
+        self.bypass_cache = bypass_cache
+        self.disable_cache = disable_cache
+        self.no_cache_read = no_cache_read
+        self.no_cache_write = no_cache_write
+        self.shared_data = shared_data
+
+        # Page Navigation and Timing Parameters
+        self.wait_until = wait_until
+        self.page_timeout = page_timeout
+        self.wait_for = wait_for
+        self.wait_for_images = wait_for_images
+        self.delay_before_return_html = delay_before_return_html
+        self.mean_delay = mean_delay
+        self.max_range = max_range
+        self.semaphore_count = semaphore_count
+
+        # Page Interaction Parameters
+        self.js_code = js_code
+        self.js_only = js_only
+        self.ignore_body_visibility = ignore_body_visibility
+        self.scan_full_page = scan_full_page
+        self.scroll_delay = scroll_delay
+        self.process_iframes = process_iframes
+        self.remove_overlay_elements = remove_overlay_elements
+        self.simulate_user = simulate_user
+        self.override_navigator = override_navigator
+        self.magic = magic
+        self.adjust_viewport_to_content = adjust_viewport_to_content
+
+        # Media Handling Parameters
+        self.screenshot = screenshot
+        self.screenshot_wait_for = screenshot_wait_for
+        self.screenshot_height_threshold = screenshot_height_threshold
+        self.pdf = pdf
+        self.capture_mhtml = capture_mhtml
+        self.image_description_min_word_threshold = image_description_min_word_threshold
+        self.image_score_threshold = image_score_threshold
+        self.exclude_external_images = exclude_external_images
+        self.exclude_all_images = exclude_all_images
+        self.table_score_threshold = table_score_threshold
+
+        # Link and Domain Handling Parameters
+        self.exclude_social_media_domains = (
+            exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+        )
+        self.exclude_external_links = exclude_external_links
+        self.exclude_social_media_links = exclude_social_media_links
+        self.exclude_domains = exclude_domains or []
+        self.exclude_internal_links = exclude_internal_links
+
+        # Debugging and Logging Parameters
+        self.verbose = verbose
+        self.log_console = log_console
+        
+        # Network and Console Capturing Parameters
+        self.capture_network_requests = capture_network_requests
+        self.capture_console_messages = capture_console_messages
+
+        # Connection Parameters
+        self.stream = stream
+        self.method = method
+
+        # Robots.txt Handling Parameters
+        self.check_robots_txt = check_robots_txt
+
+        # User Agent Parameters
+        self.user_agent = user_agent
+        self.user_agent_mode = user_agent_mode
+        self.user_agent_generator_config = user_agent_generator_config
+
+        # Validate type of extraction strategy and chunking strategy if they are provided
+        if self.extraction_strategy is not None and not isinstance(
+            self.extraction_strategy, ExtractionStrategy
+        ):
+            raise ValueError(
+                "extraction_strategy must be an instance of ExtractionStrategy"
+            )
+        if self.chunking_strategy is not None and not isinstance(
+            self.chunking_strategy, ChunkingStrategy
+        ):
+            raise ValueError(
+                "chunking_strategy must be an instance of ChunkingStrategy"
+            )
+
+        # Set default chunking strategy if None
+        if self.chunking_strategy is None:
+            self.chunking_strategy = RegexChunking()
+
+        # Deep Crawl Parameters
+        self.deep_crawl_strategy = deep_crawl_strategy
+        
+        # Experimental Parameters
+        self.experimental = experimental or {}
+
+
+    def __getattr__(self, name):
+        """Handle attribute access."""
+        if name in self._UNWANTED_PROPS:
+            raise AttributeError(f"Getting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+        raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'")
+
+    def __setattr__(self, name, value):
+        """Handle attribute setting."""
+        # TODO: Planning to set properties dynamically based on the __init__ signature
+        sig = inspect.signature(self.__init__)
+        all_params = sig.parameters  # Dictionary of parameter names and their details
+
+        if name in self._UNWANTED_PROPS and value is not all_params[name].default:
+            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+        
+        super().__setattr__(name, value)
+
+    @staticmethod
+    def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
+        return CrawlerRunConfig(
+            # Content Processing Parameters
+            word_count_threshold=kwargs.get("word_count_threshold", 200),
+            extraction_strategy=kwargs.get("extraction_strategy"),
+            chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
+            markdown_generator=kwargs.get("markdown_generator"),
+            only_text=kwargs.get("only_text", False),
+            css_selector=kwargs.get("css_selector"),
+            target_elements=kwargs.get("target_elements", []),
+            excluded_tags=kwargs.get("excluded_tags", []),
+            excluded_selector=kwargs.get("excluded_selector", ""),
+            keep_data_attributes=kwargs.get("keep_data_attributes", False),
+            keep_attrs=kwargs.get("keep_attrs", []),
+            remove_forms=kwargs.get("remove_forms", False),
+            prettiify=kwargs.get("prettiify", False),
+            parser_type=kwargs.get("parser_type", "lxml"),
+            scraping_strategy=kwargs.get("scraping_strategy"),
+            proxy_config=kwargs.get("proxy_config"),
+            proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
+            # SSL Parameters
+            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
+            # Caching Parameters
+            cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+            session_id=kwargs.get("session_id"),
+            bypass_cache=kwargs.get("bypass_cache", False),
+            disable_cache=kwargs.get("disable_cache", False),
+            no_cache_read=kwargs.get("no_cache_read", False),
+            no_cache_write=kwargs.get("no_cache_write", False),
+            shared_data=kwargs.get("shared_data", None),
+            # Page Navigation and Timing Parameters
+            wait_until=kwargs.get("wait_until", "domcontentloaded"),
+            page_timeout=kwargs.get("page_timeout", 60000),
+            wait_for=kwargs.get("wait_for"),
+            wait_for_images=kwargs.get("wait_for_images", False),
+            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
+            mean_delay=kwargs.get("mean_delay", 0.1),
+            max_range=kwargs.get("max_range", 0.3),
+            semaphore_count=kwargs.get("semaphore_count", 5),
+            # Page Interaction Parameters
+            js_code=kwargs.get("js_code"),
+            js_only=kwargs.get("js_only", False),
+            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
+            scan_full_page=kwargs.get("scan_full_page", False),
+            scroll_delay=kwargs.get("scroll_delay", 0.2),
+            process_iframes=kwargs.get("process_iframes", False),
+            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
+            simulate_user=kwargs.get("simulate_user", False),
+            override_navigator=kwargs.get("override_navigator", False),
+            magic=kwargs.get("magic", False),
+            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
+            # Media Handling Parameters
+            screenshot=kwargs.get("screenshot", False),
+            screenshot_wait_for=kwargs.get("screenshot_wait_for"),
+            screenshot_height_threshold=kwargs.get(
+                "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
+            ),
+            pdf=kwargs.get("pdf", False),
+            capture_mhtml=kwargs.get("capture_mhtml", False),
+            image_description_min_word_threshold=kwargs.get(
+                "image_description_min_word_threshold",
+                IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+            ),
+            image_score_threshold=kwargs.get(
+                "image_score_threshold", IMAGE_SCORE_THRESHOLD
+            ),
+            table_score_threshold=kwargs.get("table_score_threshold", 7),
+            exclude_all_images=kwargs.get("exclude_all_images", False),
+            exclude_external_images=kwargs.get("exclude_external_images", False),
+            # Link and Domain Handling Parameters
+            exclude_social_media_domains=kwargs.get(
+                "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
+            ),
+            exclude_external_links=kwargs.get("exclude_external_links", False),
+            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
+            exclude_domains=kwargs.get("exclude_domains", []),
+            exclude_internal_links=kwargs.get("exclude_internal_links", False),
+            # Debugging and Logging Parameters
+            verbose=kwargs.get("verbose", True),
+            log_console=kwargs.get("log_console", False),
+            # Network and Console Capturing Parameters
+            capture_network_requests=kwargs.get("capture_network_requests", False),
+            capture_console_messages=kwargs.get("capture_console_messages", False),
+            # Connection Parameters
+            method=kwargs.get("method", "GET"),
+            stream=kwargs.get("stream", False),
+            check_robots_txt=kwargs.get("check_robots_txt", False),
+            user_agent=kwargs.get("user_agent"),
+            user_agent_mode=kwargs.get("user_agent_mode"),
+            user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
+            # Deep Crawl Parameters
+            deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
+            url=kwargs.get("url"),
+            # Experimental Parameters 
+            experimental=kwargs.get("experimental"),
+        )
+
+    # Return a dict representation of the object
+    def dump(self) -> dict:
+        # Serialize the object to a dictionary
+        return to_serializable_dict(self)
+
+    @staticmethod
+    def load(data: dict) -> "CrawlerRunConfig":
+        # Deserialize the object from a dictionary
+        config = from_serializable_dict(data)
+        if isinstance(config, CrawlerRunConfig):
+            return config
+        return CrawlerRunConfig.from_kwargs(config)
+
+    def to_dict(self):
+        return {
+            "word_count_threshold": self.word_count_threshold,
+            "extraction_strategy": self.extraction_strategy,
+            "chunking_strategy": self.chunking_strategy,
+            "markdown_generator": self.markdown_generator,
+            "only_text": self.only_text,
+            "css_selector": self.css_selector,
+            "target_elements": self.target_elements,
+            "excluded_tags": self.excluded_tags,
+            "excluded_selector": self.excluded_selector,
+            "keep_data_attributes": self.keep_data_attributes,
+            "keep_attrs": self.keep_attrs,
+            "remove_forms": self.remove_forms,
+            "prettiify": self.prettiify,
+            "parser_type": self.parser_type,
+            "scraping_strategy": self.scraping_strategy,
+            "proxy_config": self.proxy_config,
+            "proxy_rotation_strategy": self.proxy_rotation_strategy,
+            "fetch_ssl_certificate": self.fetch_ssl_certificate,
+            "cache_mode": self.cache_mode,
+            "session_id": self.session_id,
+            "bypass_cache": self.bypass_cache,
+            "disable_cache": self.disable_cache,
+            "no_cache_read": self.no_cache_read,
+            "no_cache_write": self.no_cache_write,
+            "shared_data": self.shared_data,
+            "wait_until": self.wait_until,
+            "page_timeout": self.page_timeout,
+            "wait_for": self.wait_for,
+            "wait_for_images": self.wait_for_images,
+            "delay_before_return_html": self.delay_before_return_html,
+            "mean_delay": self.mean_delay,
+            "max_range": self.max_range,
+            "semaphore_count": self.semaphore_count,
+            "js_code": self.js_code,
+            "js_only": self.js_only,
+            "ignore_body_visibility": self.ignore_body_visibility,
+            "scan_full_page": self.scan_full_page,
+            "scroll_delay": self.scroll_delay,
+            "process_iframes": self.process_iframes,
+            "remove_overlay_elements": self.remove_overlay_elements,
+            "simulate_user": self.simulate_user,
+            "override_navigator": self.override_navigator,
+            "magic": self.magic,
+            "adjust_viewport_to_content": self.adjust_viewport_to_content,
+            "screenshot": self.screenshot,
+            "screenshot_wait_for": self.screenshot_wait_for,
+            "screenshot_height_threshold": self.screenshot_height_threshold,
+            "pdf": self.pdf,
+            "capture_mhtml": self.capture_mhtml,
+            "image_description_min_word_threshold": self.image_description_min_word_threshold,
+            "image_score_threshold": self.image_score_threshold,
+            "table_score_threshold": self.table_score_threshold,
+            "exclude_all_images": self.exclude_all_images,
+            "exclude_external_images": self.exclude_external_images,
+            "exclude_social_media_domains": self.exclude_social_media_domains,
+            "exclude_external_links": self.exclude_external_links,
+            "exclude_social_media_links": self.exclude_social_media_links,
+            "exclude_domains": self.exclude_domains,
+            "exclude_internal_links": self.exclude_internal_links,
+            "verbose": self.verbose,
+            "log_console": self.log_console,
+            "capture_network_requests": self.capture_network_requests,
+            "capture_console_messages": self.capture_console_messages,
+            "method": self.method,
+            "stream": self.stream,
+            "check_robots_txt": self.check_robots_txt,
+            "user_agent": self.user_agent,
+            "user_agent_mode": self.user_agent_mode,
+            "user_agent_generator_config": self.user_agent_generator_config,
+            "deep_crawl_strategy": self.deep_crawl_strategy,
+            "url": self.url,
+            "experimental": self.experimental,
+        }
+
+    def clone(self, **kwargs):
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            CrawlerRunConfig: A new instance with the specified updates
+
+        Example:
+            ```python
+            # Create a new config with streaming enabled
+            stream_config = config.clone(stream=True)
+
+            # Create a new config with multiple updates
+            new_config = config.clone(
+                stream=True,
+                cache_mode=CacheMode.BYPASS,
+                verbose=True
+            )
+            ```
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return CrawlerRunConfig.from_kwargs(config_dict)
+
+
+class LLMConfig:
+    def __init__(
+        self,
+        provider: str = DEFAULT_PROVIDER,
+        api_token: Optional[str] = None,
+        base_url: Optional[str] = None,
+        temprature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        top_p: Optional[float] = None,
+        frequency_penalty: Optional[float] = None,
+        presence_penalty: Optional[float] = None,
+        stop: Optional[List[str]] = None,
+        n: Optional[int] = None,    
+    ):
+        """Configuaration class for LLM provider and API token."""
+        self.provider = provider
+        if api_token and not api_token.startswith("env:"):
+            self.api_token = api_token
+        elif api_token and api_token.startswith("env:"):
+            self.api_token = os.getenv(api_token[4:])
+        else:
+            # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
+            # If not, check if it is in PROVIDER_MODELS
+            prefixes = PROVIDER_MODELS_PREFIXES.keys()
+            if any(provider.startswith(prefix) for prefix in prefixes):
+                selected_prefix = next(
+                    (prefix for prefix in prefixes if provider.startswith(prefix)),
+                    None,
+                )
+                self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix)                    
+            else:
+                self.provider = DEFAULT_PROVIDER
+                self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
+        self.base_url = base_url
+        self.temprature = temprature
+        self.max_tokens = max_tokens
+        self.top_p = top_p
+        self.frequency_penalty = frequency_penalty
+        self.presence_penalty = presence_penalty
+        self.stop = stop
+        self.n = n
+
+    @staticmethod
+    def from_kwargs(kwargs: dict) -> "LLMConfig":
+        return LLMConfig(
+            provider=kwargs.get("provider", DEFAULT_PROVIDER),
+            api_token=kwargs.get("api_token"),
+            base_url=kwargs.get("base_url"),
+            temprature=kwargs.get("temprature"),
+            max_tokens=kwargs.get("max_tokens"),
+            top_p=kwargs.get("top_p"),
+            frequency_penalty=kwargs.get("frequency_penalty"),
+            presence_penalty=kwargs.get("presence_penalty"),
+            stop=kwargs.get("stop"),
+            n=kwargs.get("n")
+        )
+
+    def to_dict(self):
+        return {
+            "provider": self.provider,
+            "api_token": self.api_token,
+            "base_url": self.base_url,
+            "temprature": self.temprature,
+            "max_tokens": self.max_tokens,
+            "top_p": self.top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "n": self.n
+        }
+
+    def clone(self, **kwargs):
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            llm_config: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return LLMConfig.from_kwargs(config_dict)
+
+
+
+```
+
+
+## File: crawl4ai/async_webcrawler.py
+
+```py
+from .__version__ import __version__ as crawl4ai_version
+import os
+import sys
+import time
+from colorama import Fore
+from pathlib import Path
+from typing import Optional, List
+import json
+import asyncio
+
+# from contextlib import nullcontext, asynccontextmanager
+from contextlib import asynccontextmanager
+from .models import (
+    CrawlResult,
+    MarkdownGenerationResult,
+    DispatchResult,
+    ScrapingResult,
+    CrawlResultContainer,
+    RunManyReturn
+)
+from .async_database import async_db_manager
+from .chunking_strategy import *  # noqa: F403
+from .chunking_strategy import IdentityChunking
+from .content_filter_strategy import *  # noqa: F403
+from .extraction_strategy import *  # noqa: F403
+from .extraction_strategy import NoExtractionStrategy
+from .async_crawler_strategy import (
+    AsyncCrawlerStrategy,
+    AsyncPlaywrightCrawlerStrategy,
+    AsyncCrawlResponse,
+)
+from .cache_context import CacheMode, CacheContext
+from .markdown_generation_strategy import (
+    DefaultMarkdownGenerator,
+    MarkdownGenerationStrategy,
+)
+from .deep_crawling import DeepCrawlDecorator
+from .async_logger import AsyncLogger, AsyncLoggerBase
+from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
+from .async_dispatcher import *  # noqa: F403
+from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
+
+from .utils import (
+    sanitize_input_encode,
+    InvalidCSSSelectorError,
+    fast_format_html,
+    create_box_message,
+    get_error_context,
+    RobotsParser,
+    preprocess_html_for_schema,
+)
+
+
+class AsyncWebCrawler:
+    """
+    Asynchronous web crawler with flexible caching capabilities.
+
+    There are two ways to use the crawler:
+
+    1. Using context manager (recommended for simple cases):
+        ```python
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(url="https://example.com")
+        ```
+
+    2. Using explicit lifecycle management (recommended for long-running applications):
+        ```python
+        crawler = AsyncWebCrawler()
+        await crawler.start()
+
+        # Use the crawler multiple times
+        result1 = await crawler.arun(url="https://example.com")
+        result2 = await crawler.arun(url="https://another.com")
+
+        await crawler.close()
+        ```
+
+    Attributes:
+        browser_config (BrowserConfig): Configuration object for browser settings.
+        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
+        logger (AsyncLogger): Logger instance for recording events and errors.
+        crawl4ai_folder (str): Directory for storing cache.
+        base_directory (str): Base directory for storing cache.
+        ready (bool): Whether the crawler is ready for use.
+
+    Methods:
+        start(): Start the crawler explicitly without using context manager.
+        close(): Close the crawler explicitly without using context manager.
+        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
+        awarmup(): Perform warmup sequence.
+        arun_many(): Run the crawler for multiple sources.
+        aprocess_html(): Process HTML content.
+
+    Typical Usage:
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(url="https://example.com")
+            print(result.markdown)
+
+        Using configuration:
+        browser_config = BrowserConfig(browser_type="chromium", headless=True)
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS
+            )
+            result = await crawler.arun(url="https://example.com", config=crawler_config)
+            print(result.markdown)
+    """
+
+    _domain_last_hit = {}
+
+    def __init__(
+        self,
+        crawler_strategy: AsyncCrawlerStrategy = None,
+        config: BrowserConfig = None,
+        base_directory: str = str(
+            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
+        thread_safe: bool = False,
+        logger: AsyncLoggerBase = None,
+        **kwargs,
+    ):
+        """
+        Initialize the AsyncWebCrawler.
+
+        Args:
+            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
+            config: Configuration object for browser settings. Default BrowserConfig()
+            base_directory: Base directory for storing cache
+            thread_safe: Whether to use thread-safe operations
+            **kwargs: Additional arguments for backwards compatibility
+        """
+        # Handle browser configuration
+        browser_config = config or BrowserConfig()
+
+        self.browser_config = browser_config
+
+        # Initialize logger first since other components may need it
+        self.logger = logger or AsyncLogger(
+            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
+            verbose=self.browser_config.verbose,
+            tag_width=10,
+        )
+
+        # Initialize crawler strategy
+        params = {k: v for k, v in kwargs.items() if k in [
+            "browser_config", "logger"]}
+        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
+            browser_config=browser_config,
+            logger=self.logger,
+            **params,  # Pass remaining kwargs for backwards compatibility
+        )
+
+        # Thread safety setup
+        self._lock = asyncio.Lock() if thread_safe else None
+
+        # Initialize directories
+        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
+        os.makedirs(self.crawl4ai_folder, exist_ok=True)
+        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
+
+        # Initialize robots parser
+        self.robots_parser = RobotsParser()
+
+        self.ready = False
+
+        # Decorate arun method with deep crawling capabilities
+        self._deep_handler = DeepCrawlDecorator(self)
+        self.arun = self._deep_handler(self.arun)
+
+    async def start(self):
+        """
+        Start the crawler explicitly without using context manager.
+        This is equivalent to using 'async with' but gives more control over the lifecycle
+        (``__aenter__`` simply awaits this method).
+
+        Returns:
+            AsyncWebCrawler: This crawler instance, with ``ready`` set to True
+        """
+        # Enter the crawler strategy's own async context before reporting readiness
+        await self.crawler_strategy.__aenter__()
+        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
+        # Mark the crawler as started so arun() skips its auto-start step
+        self.ready = True
+        return self
+
+    async def close(self):
+        """
+        Close the crawler explicitly without using context manager.
+        This should be called when you're done with the crawler if you used start().
+
+        This method will:
+        1. Clean up browser resources
+        2. Close any open pages and contexts
+        """
+        await self.crawler_strategy.__aexit__(None, None, None)
+
+    async def __aenter__(self):
+        """Enter the async context: start the crawler and return what start() returns."""
+        return await self.start()
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Leave the async context by closing the crawler.
+
+        The exception details are not inspected and nothing is returned, so an
+        exception raised inside the ``async with`` body is not suppressed.
+        """
+        await self.close()
+
+    @asynccontextmanager
+    async def nullcontext(self):
+        """Asynchronous no-op context manager.
+
+        Yields once without doing anything on entry or exit.
+        """
+        yield
+
+    async def arun(
+        self,
+        url: str,
+        config: CrawlerRunConfig = None,
+        **kwargs,
+    ) -> RunManyReturn:
+        """
+        Runs the crawler for a single source: URL (web, local file, or raw HTML).
+
+        A cached result is returned when the cache mode allows reading and the cached
+        entry satisfies the request. Otherwise the source is fetched through the crawler
+        strategy, processed by aprocess_html() and, when the cache mode allows writing,
+        stored in the cache. The crawler is started automatically if it is not ready.
+
+        Note that the passed config object is modified in place: a missing cache_mode
+        is set to CacheMode.ENABLED, and proxy_config is replaced when a proxy rotation
+        strategy supplies a proxy.
+
+        Migration Guide:
+        Old way (deprecated):
+            result = await crawler.arun(
+                url="https://example.com",
+                word_count_threshold=200,
+                screenshot=True,
+                ...
+            )
+
+        New way (recommended):
+            config = CrawlerRunConfig(
+                word_count_threshold=200,
+                screenshot=True,
+                ...
+            )
+            result = await crawler.arun(url="https://example.com", config=config)
+
+        Args:
+            url: The URL to crawl (http://, https://, file://, or raw:)
+            config: Configuration object controlling crawl behavior. Default CrawlerRunConfig()
+            **kwargs: Forwarded to aprocess_html(); maintained for backwards compatibility
+
+        Returns:
+            CrawlResultContainer wrapping the CrawlResult of crawling and processing.
+            Errors raised while crawling are caught and reported as a wrapped CrawlResult
+            with success=False and an error_message. When robots.txt denies access, a
+            bare CrawlResult (status_code=403, success=False) is returned.
+
+        Raises:
+            ValueError: If url is not a non-empty string.
+        """
+        # Auto-start if not ready
+        if not self.ready:
+            await self.start()
+
+        config = config or CrawlerRunConfig()
+        # Validated outside the try block below, so this error propagates to the caller
+        if not isinstance(url, str) or not url:
+            raise ValueError(
+                "Invalid URL, make sure the URL is a non-empty string")
+
+        # Serialize the crawl behind the instance lock when one was created in
+        # __init__ (thread_safe=True); otherwise use a no-op context.
+        async with self._lock or self.nullcontext():
+            try:
+                self.logger.verbose = config.verbose
+
+                # Default to ENABLED if no cache mode specified
+                if config.cache_mode is None:
+                    config.cache_mode = CacheMode.ENABLED
+
+                # Create cache context
+                cache_context = CacheContext(url, config.cache_mode, False)
+
+                # Initialize processing variables
+                async_response: AsyncCrawlResponse = None
+                cached_result: CrawlResult = None
+                screenshot_data = None
+                pdf_data = None
+                extracted_content = None
+                start_time = time.perf_counter()
+
+                # Try to get cached result if appropriate
+                if cache_context.should_read():
+                    cached_result = await async_db_manager.aget_cached_url(url)
+
+                if cached_result:
+                    html = sanitize_input_encode(cached_result.html)
+                    extracted_content = sanitize_input_encode(
+                        cached_result.extracted_content or ""
+                    )
+                    # Treat an empty string or an empty JSON list as "nothing extracted"
+                    extracted_content = (
+                        None
+                        if not extracted_content or extracted_content == "[]"
+                        else extracted_content
+                    )
+                    # If a screenshot or PDF is requested but is not in the cache, discard
+                    # the cached result so that fresh content is fetched below.
+                    screenshot_data = cached_result.screenshot
+                    pdf_data = cached_result.pdf
+                    # if config.screenshot and not screenshot or config.pdf and not pdf:
+                    if config.screenshot and not screenshot_data:
+                        cached_result = None
+
+                    if config.pdf and not pdf_data:
+                        cached_result = None
+
+                    # NOTE(review): this status line is logged even when the cached
+                    # result was just discarded above.
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - start_time,
+                        tag="FETCH",
+                    )
+
+                # Update proxy configuration from rotation strategy if available
+                if config and config.proxy_rotation_strategy:
+                    next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
+                    if next_proxy:
+                        self.logger.info(
+                            message="Switch proxy: {proxy}",
+                            tag="PROXY",
+                            params={"proxy": next_proxy.server}
+                        )
+                        # Mutates the caller's config object rather than a copy
+                        config.proxy_config = next_proxy
+                        # config = config.clone(proxy_config=next_proxy)
+
+                # Fetch fresh content if needed.
+                # `html` is only bound when a cache entry was found; the short-circuit
+                # on `not cached_result` keeps it from being read otherwise.
+                if not cached_result or not html:
+                    t1 = time.perf_counter()
+
+                    if config.user_agent:
+                        self.crawler_strategy.update_user_agent(
+                            config.user_agent)
+
+                    # Check robots.txt if enabled
+                    if config and config.check_robots_txt:
+                        if not await self.robots_parser.can_fetch(
+                            url, self.browser_config.user_agent
+                        ):
+                            # NOTE(review): returned as a bare CrawlResult, unlike the
+                            # other return paths, which wrap in CrawlResultContainer.
+                            return CrawlResult(
+                                url=url,
+                                html="",
+                                success=False,
+                                status_code=403,
+                                error_message="Access denied by robots.txt",
+                                response_headers={
+                                    "X-Robots-Status": "Blocked by robots.txt"
+                                },
+                            )
+
+                    ##############################
+                    # Call CrawlerStrategy.crawl #
+                    ##############################
+                    async_response = await self.crawler_strategy.crawl(
+                        url,
+                        config=config,  # Pass the entire config object
+                    )
+
+                    html = sanitize_input_encode(async_response.html)
+                    screenshot_data = async_response.screenshot
+                    pdf_data = async_response.pdf_data
+                    js_execution_result = async_response.js_execution_result
+
+                    t2 = time.perf_counter()
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=bool(html),
+                        timing=t2 - t1,
+                        tag="FETCH",
+                    )
+
+                    #########################################################
+                    # Process the HTML content, call self.aprocess_html     #
+                    #########################################################
+                    # extracted_content may still hold the value read from the cache
+                    # when the cached result was discarded above.
+                    crawl_result: CrawlResult = await self.aprocess_html(
+                        url=url,
+                        html=html,
+                        extracted_content=extracted_content,
+                        config=config,  # Pass the config object instead of individual parameters
+                        screenshot=screenshot_data,
+                        pdf_data=pdf_data,
+                        verbose=config.verbose,
+                        is_raw_html=True if url.startswith("raw:") else False,
+                        **kwargs,
+                    )
+
+                    # Copy response-level details from the strategy response onto the result
+                    crawl_result.status_code = async_response.status_code
+                    crawl_result.redirected_url = async_response.redirected_url or url
+                    crawl_result.response_headers = async_response.response_headers
+                    crawl_result.downloaded_files = async_response.downloaded_files
+                    crawl_result.js_execution_result = js_execution_result
+                    crawl_result.mhtml = async_response.mhtml_data
+                    crawl_result.ssl_certificate = async_response.ssl_certificate
+                    # Add captured network and console data if available
+                    crawl_result.network_requests = async_response.network_requests
+                    crawl_result.console_messages = async_response.console_messages
+
+                    # Success is defined as having received non-empty HTML
+                    crawl_result.success = bool(html)
+                    crawl_result.session_id = getattr(
+                        config, "session_id", None)
+
+                    self.logger.success(
+                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                        tag="COMPLETE",
+                        params={
+                            "url": cache_context.display_url,
+                            "status": crawl_result.success,
+                            "timing": f"{time.perf_counter() - start_time:.2f}s",
+                        },
+                        colors={
+                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
+                            "timing": Fore.YELLOW,
+                        },
+                    )
+
+                    # Update cache if appropriate
+                    if cache_context.should_write() and not bool(cached_result):
+                        await async_db_manager.acache_url(crawl_result)
+
+                    return CrawlResultContainer(crawl_result)
+
+                else:
+                    # Cache hit: the cached result is usable as-is
+                    self.logger.success(
+                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                        tag="COMPLETE",
+                        params={
+                            "url": cache_context.display_url,
+                            "status": True,
+                            "timing": f"{time.perf_counter() - start_time:.2f}s",
+                        },
+                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                    )
+
+                    cached_result.success = bool(html)
+                    cached_result.session_id = getattr(
+                        config, "session_id", None)
+                    cached_result.redirected_url = cached_result.redirected_url or url
+                    return CrawlResultContainer(cached_result)
+
+            except Exception as e:
+                # Any failure is logged and converted into a failed result instead of
+                # propagating to the caller.
+                error_context = get_error_context(sys.exc_info())
+
+                # NOTE(review): the message names "_crawl_web", which is not the name
+                # of this method; presumably a leftover from an earlier version.
+                error_message = (
+                    f"Unexpected error in _crawl_web at line {error_context['line_no']} "
+                    f"in {error_context['function']} ({error_context['filename']}):\n"
+                    f"Error: {str(e)}\n\n"
+                    f"Code context:\n{error_context['code_context']}"
+                )
+
+                self.logger.error_status(
+                    url=url,
+                    error=create_box_message(error_message, type="error"),
+                    tag="ERROR",
+                )
+
+                return CrawlResultContainer(
+                    CrawlResult(
+                        url=url, html="", success=False, error_message=error_message
+                    )
+                )
+
+    async def aprocess_html(
+        self,
+        url: str,
+        html: str,
+        extracted_content: str,
+        config: CrawlerRunConfig,
+        screenshot: str,
+        pdf_data: str,
+        verbose: bool,
+        **kwargs,
+    ) -> CrawlResult:
+        """
+        Process HTML content using the provided configuration.
+
+        Runs three stages in order: the scraping strategy (cleaned HTML, media, links,
+        metadata), markdown generation, and - unless extracted content was already
+        supplied - the configured extraction strategy.
+
+        Args:
+            url: The URL being processed
+            html: Raw HTML content
+            extracted_content: Previously extracted content (if any). When non-empty,
+                the extraction stage is skipped and this value is returned unchanged
+            config: Configuration object controlling processing behavior
+            screenshot: Screenshot data (if any)
+            pdf_data: PDF data (if any)
+            verbose: Whether to enable verbose logging (not read in this method body)
+            **kwargs: Additional parameters for backwards compatibility. ``is_raw_html``
+                switches the URL shown in log lines to "Raw HTML"; keys not already
+                present on the config are passed through to the scraping strategy
+
+        Returns:
+            CrawlResult: Processed result containing extracted and formatted content
+
+        Raises:
+            ValueError: If the scraping strategy fails, returns None, or reports an
+                invalid CSS selector
+        """
+        cleaned_html = ""
+        try:
+            # Label used in log lines instead of the (possibly very long) raw HTML "URL"
+            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
+            t1 = time.perf_counter()
+
+            # Get scraping strategy and ensure it has a logger
+            scraping_strategy = config.scraping_strategy
+            if not scraping_strategy.logger:
+                scraping_strategy.logger = self.logger
+
+            # Process HTML content: every config attribute becomes a keyword argument
+            params = config.__dict__.copy()
+            params.pop("url", None)
+            # Add keys from kwargs that do not already exist in params (config wins)
+            params.update({k: v for k, v in kwargs.items()
+                          if k not in params.keys()})
+
+            ################################
+            # Scraping Strategy Execution  #
+            ################################
+            result: ScrapingResult = scraping_strategy.scrap(
+                url, html, **params)
+
+            if result is None:
+                raise ValueError(
+                    f"Process HTML, Failed to extract content from the website: {url}"
+                )
+
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e))
+        except Exception as e:
+            # Also catches the ValueError raised above for a None result and re-wraps it
+            raise ValueError(
+                f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
+            )
+
+        # Extract results - handle both dict and ScrapingResult
+        if isinstance(result, dict):
+            cleaned_html = sanitize_input_encode(
+                result.get("cleaned_html", ""))
+            media = result.get("media", {})
+            links = result.get("links", {})
+            metadata = result.get("metadata", {})
+        else:
+            cleaned_html = sanitize_input_encode(result.cleaned_html)
+            media = result.media.model_dump()
+            links = result.links.model_dump()
+            metadata = result.metadata
+
+        ################################
+        # Generate Markdown            #
+        ################################
+        markdown_generator: Optional[MarkdownGenerationStrategy] = (
+            config.markdown_generator or DefaultMarkdownGenerator()
+        )
+
+        # --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE ---
+        # Get the desired source from the generator config, default to 'cleaned_html'
+        selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html')
+
+        # Define the source selection logic using dict dispatch.
+        # Lambdas keep the selection lazy, so preprocess_html_for_schema only runs
+        # when 'fit_html' is actually selected.
+        html_source_selector = {
+            "raw_html": lambda: html,  # The original raw HTML
+            "cleaned_html": lambda: cleaned_html,  # The HTML after scraping strategy
+            "fit_html": lambda: preprocess_html_for_schema(html_content=html),  # Preprocessed raw HTML
+        }
+
+        markdown_input_html = cleaned_html  # Default to cleaned_html
+
+        try:
+            # Get the appropriate lambda function, default to returning cleaned_html if key not found
+            source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html)
+            # Execute the lambda to get the selected HTML
+            markdown_input_html = source_lambda()
+
+            # Log which source is being used (optional, but helpful for debugging)
+            # if self.logger and verbose:
+            #     actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)'
+            #     self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC")
+
+        except Exception as e:
+            # Handle potential errors, especially from preprocess_html_for_schema
+            if self.logger:
+                self.logger.warning(
+                    f"Error getting/processing '{selected_html_source}' for markdown source: {e}. Falling back to cleaned_html.",
+                    tag="MARKDOWN_SRC"
+                )
+            # Ensure markdown_input_html is still the default cleaned_html in case of error
+            markdown_input_html = cleaned_html
+        # --- END: HTML SOURCE SELECTION ---
+
+        # Uncomment if by default we want to use PruningContentFilter
+        # if not config.content_filter and not markdown_generator.content_filter:
+        #     markdown_generator.content_filter = PruningContentFilter()
+
+        markdown_result: MarkdownGenerationResult = (
+            markdown_generator.generate_markdown(
+                input_html=markdown_input_html,
+                base_url=url,
+                # html2text_options=kwargs.get('html2text', {})
+            )
+        )
+
+        # Log processing completion (elapsed time covers scraping and markdown
+        # generation, truncated to milliseconds)
+        self.logger.info(
+            message="{url:.50}... | Time: {timing}s",
+            tag="SCRAPE",
+            params={
+                "url": _url,
+                "timing": int((time.perf_counter() - t1) * 1000) / 1000,
+            },
+        )
+
+        ################################
+        # Structured Content Extraction #
+        ################################
+        # Skipped when content was already extracted (e.g. supplied by the caller)
+        # or when no real extraction strategy is configured.
+        if (
+            not bool(extracted_content)
+            and config.extraction_strategy
+            and not isinstance(config.extraction_strategy, NoExtractionStrategy)
+        ):
+            # Restart the timer so the EXTRACT log reports extraction time only
+            t1 = time.perf_counter()
+            # Choose content based on input_format
+            content_format = config.extraction_strategy.input_format
+            if content_format == "fit_markdown" and not markdown_result.fit_markdown:
+                self.logger.warning(
+                    message="Fit markdown requested but not available. Falling back to raw markdown.",
+                    tag="EXTRACT",
+                    params={"url": _url},
+                )
+                content_format = "markdown"
+
+            # Unknown formats fall back to the raw markdown
+            content = {
+                "markdown": markdown_result.raw_markdown,
+                "html": html,
+                "cleaned_html": cleaned_html,
+                "fit_markdown": markdown_result.fit_markdown,
+            }.get(content_format, markdown_result.raw_markdown)
+
+            # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
+            chunking = (
+                IdentityChunking()
+                if content_format in ["html", "cleaned_html"]
+                else config.chunking_strategy
+            )
+            sections = chunking.chunk(content)
+            extracted_content = config.extraction_strategy.run(url, sections)
+            # Serialize to a JSON string; values json cannot encode are stringified
+            extracted_content = json.dumps(
+                extracted_content, indent=4, default=str, ensure_ascii=False
+            )
+
+            # Log extraction completion
+            self.logger.info(
+                message="Completed for {url:.50}... | Time: {timing}s",
+                tag="EXTRACT",
+                params={"url": _url, "timing": time.perf_counter() - t1},
+            )
+
+        # Handle screenshot and PDF data: normalize falsy values to None
+        screenshot_data = None if not screenshot else screenshot
+        pdf_data = None if not pdf_data else pdf_data
+
+        # Apply HTML formatting if requested.
+        # "prettiify" (sic) is the attribute name read from the config object.
+        if config.prettiify:
+            cleaned_html = fast_format_html(cleaned_html)
+
+        # Return complete crawl result
+        return CrawlResult(
+            url=url,
+            html=html,
+            cleaned_html=cleaned_html,
+            markdown=markdown_result,
+            media=media,
+            links=links,
+            metadata=metadata,
+            screenshot=screenshot_data,
+            pdf=pdf_data,
+            extracted_content=extracted_content,
+            success=True,
+            error_message="",
+        )
+
+    async def arun_many(
+        self,
+        urls: List[str],
+        config: Optional[CrawlerRunConfig] = None,
+        dispatcher: Optional[BaseDispatcher] = None,
+        # Legacy parameters maintained for backwards compatibility
+        # word_count_threshold=MIN_WORD_THRESHOLD,
+        # extraction_strategy: ExtractionStrategy = None,
+        # chunking_strategy: ChunkingStrategy = RegexChunking(),
+        # content_filter: RelevantContentFilter = None,
+        # cache_mode: Optional[CacheMode] = None,
+        # bypass_cache: bool = False,
+        # css_selector: str = None,
+        # screenshot: bool = False,
+        # pdf: bool = False,
+        # user_agent: str = None,
+        # verbose=True,
+        **kwargs,
+    ) -> RunManyReturn:
+        """
+        Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
+
+        Args:
+        urls: List of URLs to crawl
+        config: Configuration object controlling crawl behavior for all URLs
+        dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
+        [other parameters maintained for backwards compatibility]
+
+        Returns:
+        Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
+            Either a list of all results or an async generator yielding results
+
+        Examples:
+
+        # Batch processing (default)
+        results = await crawler.arun_many(
+            urls=["https://example1.com", "https://example2.com"],
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+        )
+        for result in results:
+            print(f"Processed {result.url}: {len(result.markdown)} chars")
+
+        # Streaming results
+        async for result in await crawler.arun_many(
+            urls=["https://example1.com", "https://example2.com"],
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True),
+        ):
+            print(f"Processed {result.url}: {len(result.markdown)} chars")
+        """
+        config = config or CrawlerRunConfig()
+        # if config is None:
+        #     config = CrawlerRunConfig(
+        #         word_count_threshold=word_count_threshold,
+        #         extraction_strategy=extraction_strategy,
+        #         chunking_strategy=chunking_strategy,
+        #         content_filter=content_filter,
+        #         cache_mode=cache_mode,
+        #         bypass_cache=bypass_cache,
+        #         css_selector=css_selector,
+        #         screenshot=screenshot,
+        #         pdf=pdf,
+        #         verbose=verbose,
+        #         **kwargs,
+        #     )
+
+        if dispatcher is None:
+            dispatcher = MemoryAdaptiveDispatcher(
+                rate_limiter=RateLimiter(
+                    base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3
+                ),
+            )
+
+        def transform_result(task_result):
+            return (
+                setattr(
+                    task_result.result,
+                    "dispatch_result",
+                    DispatchResult(
+                        task_id=task_result.task_id,
+                        memory_usage=task_result.memory_usage,
+                        peak_memory=task_result.peak_memory,
+                        start_time=task_result.start_time,
+                        end_time=task_result.end_time,
+                        error_message=task_result.error_message,
+                    ),
+                )
+                or task_result.result
+            )
+
+        stream = config.stream
+
+        if stream:
+
+            async def result_transformer():
+                async for task_result in dispatcher.run_urls_stream(
+                    crawler=self, urls=urls, config=config
+                ):
+                    yield transform_result(task_result)
+
+            return result_transformer()
+        else:
+            _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
+            return [transform_result(res) for res in _results]
+
+```
+
+
+## File: crawl4ai/cli.py
+
+```py
+import click
+import os
+import sys
+import time
+
+import humanize
+from typing import Dict, Any, Optional, List
+import json
+import yaml
+import anyio
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.prompt import Prompt, Confirm
+
+from crawl4ai import (
+    CacheMode,
+    AsyncWebCrawler, 
+    CrawlResult,
+    BrowserConfig, 
+    CrawlerRunConfig,
+    LLMExtractionStrategy, 
+    LXMLWebScrapingStrategy,
+    JsonCssExtractionStrategy,
+    JsonXPathExtractionStrategy,
+    BM25ContentFilter, 
+    PruningContentFilter,
+    BrowserProfiler,
+    DefaultMarkdownGenerator,
+    LLMConfig
+)
+from crawl4ai.config import USER_SETTINGS
+from litellm import completion
+from pathlib import Path
+
+
+# Initialize rich console
+console = Console()
+
+def get_global_config() -> dict:
+    """Load the user's global settings from ``~/.crawl4ai/global.yml``.
+
+    When the file is absent, the ``~/.crawl4ai`` directory is created (if
+    needed) and an empty dict is returned. An empty YAML document also yields
+    an empty dict.
+    """
+    settings_path = Path.home() / ".crawl4ai" / "global.yml"
+
+    if settings_path.exists():
+        with open(settings_path) as handle:
+            loaded = yaml.safe_load(handle)
+        return loaded or {}
+
+    # No settings file yet: make sure its directory exists, then report "empty".
+    settings_path.parent.mkdir(parents=True, exist_ok=True)
+    return {}
+
+def save_global_config(config: dict):
+    """Persist ``config`` as YAML to ``~/.crawl4ai/global.yml``.
+
+    The parent directory is created first, so saving works even when nothing
+    has created ``~/.crawl4ai`` yet.
+
+    Args:
+        config: Mapping of setting names to values, written with ``yaml.dump``.
+    """
+    config_file = Path.home() / ".crawl4ai" / "global.yml"
+    # Without this, open() raises FileNotFoundError on a fresh machine.
+    config_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(config_file, "w") as f:
+        yaml.dump(config, f)
+
+def setup_llm_config() -> tuple[str, str]:
+    """Resolve the default LLM provider and API token, prompting when missing.
+
+    Values come from the global config. A missing provider is asked for
+    interactively. Providers starting with ``ollama/`` use the placeholder
+    token ``"no-token"``; any other provider without a stored token triggers a
+    hidden-input prompt. If either value was absent from the stored config,
+    both are written back to it.
+
+    Returns:
+        A ``(provider, token)`` tuple.
+    """
+    settings = get_global_config()
+    saved_provider = settings.get("DEFAULT_LLM_PROVIDER")
+    saved_token = settings.get("DEFAULT_LLM_PROVIDER_TOKEN")
+
+    provider = saved_provider
+    if not provider:
+        click.echo("\nNo default LLM provider configured.")
+        click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
+        click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
+        provider = click.prompt("Enter provider")
+
+    if provider.startswith("ollama/"):
+        token = "no-token"
+    elif saved_token:
+        token = saved_token
+    else:
+        token = click.prompt("Enter API token for " + provider, hide_input=True)
+
+    # Persist only when the stored config was incomplete to begin with.
+    if not (saved_provider and saved_token):
+        settings["DEFAULT_LLM_PROVIDER"] = provider
+        settings["DEFAULT_LLM_PROVIDER_TOKEN"] = token
+        save_global_config(settings)
+        click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")
+
+    return provider, token
+
+async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
+    """Ask the LLM ``query`` about crawled ``markdown`` and print the streamed answer.
+
+    Sends a system message naming ``url`` as the source plus a user message
+    that wraps the markdown in context markers followed by the query. Each
+    non-empty content delta is printed to stdout as it arrives.
+
+    Note: ``completion`` is called without ``await`` and the stream is consumed
+    with a plain ``for`` loop.
+    """
+    system_message = {
+        "content": f"You are Crawl4ai assistant, answering user question based on the provided context which is crawled from {url}.",
+        "role": "system"
+    }
+    user_message = {
+        "content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
+        "role": "user"
+    }
+
+    chunks = completion(
+        model=provider,
+        api_key=token,
+        messages=[system_message, user_message],
+        stream=True,
+    )
+
+    for piece in chunks:
+        text = piece["choices"][0]["delta"].get("content")
+        if text:
+            print(text, end="", flush=True)
+    print()  # Terminate the streamed line
+
+
+
+def _split_top_level_commas(value: str) -> List[str]:
+    """Split ``value`` on commas that are not nested inside ``[...]`` or ``{...}``."""
+    parts = []
+    current = []
+    depth = 0
+    for ch in value:
+        if ch in "[{":
+            depth += 1
+        elif ch in "]}":
+            depth = max(depth - 1, 0)
+        elif ch == "," and depth == 0:
+            parts.append("".join(current))
+            current = []
+            continue
+        current.append(ch)
+    parts.append("".join(current))
+    # Unbalanced brackets: fall back to the plain split so such input is
+    # handled exactly as it was before bracket awareness was added.
+    if depth != 0:
+        return value.split(',')
+    return parts
+
+
+def parse_key_values(ctx, param, value) -> Dict[str, Any]:
+    """Click callback turning ``"k1=v1,k2=v2"`` into a dict with light type coercion.
+
+    Pairs are separated by commas that sit outside ``[...]`` / ``{...}``, so
+    list and JSON-object values may themselves contain commas. Values are
+    coerced in this order: ``true``/``false`` (case-insensitive) to bool, digit
+    strings to int, digit strings with one dot to float, ``[a,b]`` to a list of
+    stripped strings, ``{...}`` to a JSON-decoded object. Anything else stays
+    a string.
+
+    Args:
+        ctx: Click context (unused).
+        param: Click parameter (unused).
+        value: Raw option string, or a falsy value when the option was omitted.
+
+    Returns:
+        The parsed mapping; empty when ``value`` is falsy.
+
+    Raises:
+        click.BadParameter: If a pair has no ``=`` or a ``{...}`` value is not
+            valid JSON.
+    """
+    if not value:
+        return {}
+    result = {}
+    for pair in _split_top_level_commas(value):
+        try:
+            k, v = pair.split('=', 1)
+            # Handle common value types
+            if v.lower() == 'true':
+                v = True
+            elif v.lower() == 'false':
+                v = False
+            elif v.isdigit():
+                v = int(v)
+            elif v.replace('.', '', 1).isdigit():
+                v = float(v)
+            elif v.startswith('[') and v.endswith(']'):
+                v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
+            elif v.startswith('{') and v.endswith('}'):
+                try:
+                    v = json.loads(v)
+                except json.JSONDecodeError:
+                    raise click.BadParameter(f'Invalid JSON object: {v}')
+            result[k.strip()] = v
+        except ValueError:
+            raise click.BadParameter(f'Invalid key=value pair: {pair}')
+    return result
+
+def load_config_file(path: Optional[str]) -> dict:
+    """Load a YAML or JSON config file.
+
+    Files ending in ``.yaml`` / ``.yml`` are parsed as YAML; everything else is
+    parsed as JSON.
+
+    Args:
+        path: File to read. A falsy value means "no file".
+
+    Returns:
+        The parsed content; ``{}`` when ``path`` is falsy or the YAML document
+        is empty.
+
+    Raises:
+        click.BadParameter: If the file cannot be opened or parsed.
+    """
+    if not path:
+        return {}
+
+    try:
+        with open(path) as f:
+            if path.endswith((".yaml", ".yml")):
+                # safe_load returns None for an empty document; normalise to
+                # {} so the result matches the declared return type.
+                return yaml.safe_load(f) or {}
+            return json.load(f)
+    except Exception as e:
+        raise click.BadParameter(f'Error loading config file {path}: {str(e)}') from e
+
+def load_schema_file(path: Optional[str]) -> Optional[dict]:
+    """Load an extraction schema file, or return ``None`` when no path is given.
+
+    Args:
+        path: Schema file to read. A falsy value means "no schema".
+
+    Returns:
+        ``None`` for a falsy ``path``; otherwise whatever ``load_config_file``
+        returns for it.
+    """
+    if not path:
+        return None
+    return load_config_file(path)
+
+async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
+    """Crawl ``url`` once and return the crawl result.
+
+    With ``verbose`` set, both configurations are echoed (via their ``dump()``)
+    before the crawler starts. Any exception raised by the crawl itself is
+    re-raised as ``click.ClickException`` with a "Crawling failed" message.
+    """
+    if verbose:
+        click.echo("Starting crawler with configurations:")
+        browser_dump = browser_cfg.dump()
+        click.echo(f"Browser config: {browser_dump}")
+        crawler_dump = crawler_cfg.dump()
+        click.echo(f"Crawler config: {crawler_dump}")
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        try:
+            return await crawler.arun(url=url, config=crawler_cfg)
+        except Exception as exc:
+            raise click.ClickException(f"Crawling failed: {str(exc)}")
+
+def show_examples():
+    """Print the built-in collection of ``crwl`` usage examples and sample config files.
+
+    The text is echoed as-is with ``click.echo``. The command lines shown are
+    meant to be copy-pasteable, so they must match the options the commands
+    actually declare (e.g. ``crwl cdp`` takes ``-d/--user-data-dir`` and
+    ``-P/--port``).
+    """
+    examples = """
+🚀 Crawl4AI CLI Examples
+
+1️⃣  Basic Usage:
+    # Simple crawl with default settings
+    crwl https://example.com
+
+    # Get markdown output
+    crwl https://example.com -o markdown
+
+    # Verbose JSON output with cache bypass
+    crwl https://example.com -o json -v --bypass-cache
+
+2️⃣  Using Config Files:
+    # Using browser and crawler configs
+    crwl https://example.com -B browser.yml -C crawler.yml
+
+    # CSS-based extraction
+    crwl https://example.com -e extract_css.yml -s css_schema.json -o json
+
+    # LLM-based extraction with config file
+    crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
+    
+    # Quick LLM-based JSON extraction (prompts for LLM provider first time)
+    crwl https://example.com -j  # Auto-extracts structured data
+    crwl https://example.com -j "Extract product details including name, price, and features"  # With specific instructions
+
+3️⃣  Direct Parameters:
+    # Browser settings
+    crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+
+    # Crawler settings
+    crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+
+4️⃣  Profile Management for Identity-Based Crawling:
+    # Launch interactive profile manager
+    crwl profiles
+
+    # Create, list, and delete browser profiles for identity-based crawling
+    # Use a profile for crawling (keeps you logged in)
+    crwl https://example.com -p my-profile-name
+
+    # Example: Crawl a site that requires login
+    # 1. First create a profile and log in:
+    crwl profiles
+    # 2. Then use that profile to crawl the authenticated site:
+    crwl https://site-requiring-login.com/dashboard -p my-profile-name
+
+5️⃣  CDP Mode for Browser Automation:
+    # Launch browser with CDP debugging on default port 9222
+    crwl cdp
+
+    # Use a specific user data directory and custom port
+    crwl cdp -d ~/my-browser-data -P 9223
+
+    # Launch headless browser with CDP enabled
+    crwl cdp --headless
+
+    # Launch in incognito mode (ignores user data directory)
+    crwl cdp --incognito
+
+    # Use the CDP URL with other tools (Puppeteer, Playwright, etc.)
+    # The URL will be displayed in the terminal when the browser starts
+
+    
+6️⃣  Sample Config Files:
+
+browser.yml:
+    headless: true
+    viewport_width: 1280
+    user_agent_mode: "random"
+    verbose: true
+    ignore_https_errors: true
+
+extract_css.yml:
+    type: "json-css"
+    params:
+        verbose: true
+
+css_schema.json:
+    {
+      "name": "ArticleExtractor",
+      "baseSelector": ".article",
+      "fields": [
+        {
+          "name": "title",
+          "selector": "h1.title",
+          "type": "text"
+        },
+        {
+          "name": "link",
+          "selector": "a.read-more",
+          "type": "attribute",
+          "attribute": "href"
+        }
+      ]
+    }
+
+extract_llm.yml:
+    type: "llm"
+    provider: "openai/gpt-4"
+    instruction: "Extract all articles with their titles and links"
+    api_token: "your-token"
+    params:
+        temperature: 0.3
+        max_tokens: 1000
+
+llm_schema.json:
+    {
+      "title": "Article",
+      "type": "object",
+      "properties": {
+        "title": {
+          "type": "string",
+          "description": "The title of the article"
+        },
+        "link": {
+          "type": "string",
+          "description": "URL to the full article"
+        }
+      }
+    }
+
+7️⃣  Advanced Usage:
+    # Combine configs with direct parameters
+    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
+
+    # Full extraction pipeline with config files
+    crwl https://example.com \\
+        -B browser.yml \\
+        -C crawler.yml \\
+        -e extract_llm.yml \\
+        -s llm_schema.json \\
+        -o json \\
+        -v
+        
+    # Quick LLM-based extraction with specific instructions
+    crwl https://amazon.com/dp/B01DFKC2SO \\
+        -j "Extract product title, current price, original price, rating, and all product specifications" \\
+        -b "headless=true,viewport_width=1280" \\
+        -v
+
+    # Content filtering with BM25
+    crwl https://example.com \\
+        -f filter_bm25.yml \\
+        -o markdown-fit
+
+    # Authenticated crawling with profile
+    crwl https://login-required-site.com \\
+        -p my-authenticated-profile \\
+        -c "css_selector=.dashboard-content" \\
+        -o markdown
+
+For more documentation visit: https://github.com/unclecode/crawl4ai
+
+8️⃣  Q&A with LLM:
+    # Ask a question about the content
+    crwl https://example.com -q "What is the main topic discussed?"
+
+    # First view content, then ask questions
+    crwl https://example.com -o markdown  # See the crawled content first
+    crwl https://example.com -q "Summarize the key points"
+    crwl https://example.com -q "What are the conclusions?"
+
+    # Advanced crawling with Q&A
+    crwl https://example.com \\
+        -B browser.yml \\
+        -c "css_selector=article,scan_full_page=true" \\
+        -q "What are the pros and cons mentioned?"
+
+    Note: First time using -q will prompt for LLM provider and API token.
+    These will be saved in ~/.crawl4ai/global.yml for future use.
+    
+    Supported provider format: 'company/model'
+    Examples:
+      - ollama/llama3.3
+      - openai/gpt-4
+      - anthropic/claude-3-sonnet
+      - cohere/command
+      - google/gemini-pro
+    
+    See full list of providers: https://docs.litellm.ai/docs/providers
+    
+    # Set default LLM provider and token in advance
+    crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet"
+    crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token-here"
+    
+    # Set default browser behavior
+    crwl config set BROWSER_HEADLESS false  # Always show browser window
+    crwl config set USER_AGENT_MODE random  # Use random user agent
+
+9️⃣ Profile Management:
+    # Launch interactive profile manager
+    crwl profiles
+
+    # Create a profile and use it for crawling
+    crwl profiles  # Create and set up your profile interactively
+    crwl https://example.com -p my-profile-name  # Use profile for crawling
+
+    # Example workflow for authenticated site
+    # 1. First create a profile and log in to the site:
+    crwl profiles  # Select "Create new profile" option
+    # 2. Then use that profile to crawl authenticated content:
+    crwl https://site-requiring-login.com/dashboard -p my-profile-name
+
+🔄 Builtin Browser Management:
+    # Start a builtin browser (runs in the background)
+    crwl browser start
+    
+    # Check builtin browser status
+    crwl browser status
+    
+    # Open a visible window to see the browser
+    crwl browser view --url https://example.com
+    
+    # Stop the builtin browser
+    crwl browser stop
+    
+    # Restart with different options
+    crwl browser restart --browser-type chromium --port 9223 --no-headless
+    
+    # Use the builtin browser in your code
+    # (Just set browser_mode="builtin" in your BrowserConfig)
+    browser_config = BrowserConfig(
+        browser_mode="builtin", 
+        headless=True
+    )
+    
+    # Usage via CLI:
+    crwl https://example.com -b "browser_mode=builtin"
+"""
+    click.echo(examples)
+
+def get_directory_size(path: str) -> int:
+    """Calculate the total size of a directory in bytes.
+
+    Walks ``path`` recursively and sums the sizes of regular entries. Symbolic
+    links are not counted. Files whose size cannot be read (for example
+    because they disappear or are unreadable while the tree is being walked)
+    are skipped instead of aborting the whole computation.
+
+    Args:
+        path: Directory to measure. A missing directory yields ``0``.
+
+    Returns:
+        Total size in bytes of the files that could be measured.
+    """
+    total_size = 0
+    for dirpath, _, filenames in os.walk(path):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            if os.path.islink(fp):
+                continue
+            try:
+                total_size += os.path.getsize(fp)
+            except OSError:
+                # Best-effort size: one bad entry should not fail the listing.
+                continue
+    return total_size
+
+def display_profiles_table(profiles: List[Dict[str, Any]]):
+    """Render browser profiles as a rich table, or a notice panel when there are none.
+
+    Each profile dict is read for ``name``, ``path``, ``created`` (formatted
+    with ``strftime``) and ``type``; the on-disk size of ``path`` is computed
+    per row and shown in human-readable form.
+    """
+    if not profiles:
+        notice = Panel(
+            "[yellow]No profiles found. Create one with the 'create' command.[/yellow]",
+            title="Browser Profiles",
+            border_style="blue",
+        )
+        console.print(notice)
+        return
+
+    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")
+    column_specs = [
+        ("#", {"style": "dim", "width": 4}),
+        ("Name", {"style": "cyan", "no_wrap": True}),
+        ("Path", {"style": "green"}),
+        ("Created", {"style": "yellow"}),
+        ("Browser", {"style": "magenta"}),
+        ("Size", {"style": "blue", "justify": "right"}),
+    ]
+    for header, options in column_specs:
+        table.add_column(header, **options)
+
+    for row_number, profile in enumerate(profiles, start=1):
+        size_text = humanize.naturalsize(get_directory_size(profile["path"]))
+        created_text = profile["created"].strftime("%Y-%m-%d %H:%M")
+        table.add_row(
+            str(row_number),
+            profile["name"],
+            profile["path"],
+            created_text,
+            profile["type"].capitalize(),
+            size_text,
+        )
+
+    console.print(table)
+
+async def create_profile_interactive(profiler: BrowserProfiler):
+    """Walk the user through creating a browser profile.
+
+    Shows an explanatory panel, asks for a profile name (defaulting to
+    ``profile_<unix time>``), then awaits ``profiler.create_profile`` and
+    reports the resulting path, a failure, or the raised error.
+    """
+    intro = (
+        "[bold cyan]Create Browser Profile[/bold cyan]\n"
+        "This will open a browser window for you to set up your identity.\n"
+        "Log in to sites, adjust settings, then press 'q' to save."
+    )
+    console.print(Panel(intro, border_style="cyan"))
+
+    default_name = f"profile_{int(time.time())}"
+    profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=default_name)
+
+    console.print("[cyan]Creating profile...[/cyan]")
+    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")
+
+    try:
+        profile_path = await profiler.create_profile(profile_name)
+        outcome = (
+            f"[green]Profile successfully created at:[/green] {profile_path}"
+            if profile_path
+            else "[red]Failed to create profile.[/red]"
+        )
+        console.print(outcome)
+    except Exception as exc:
+        console.print(f"[red]Error creating profile: {str(exc)}[/red]")
+
+def delete_profile_interactive(profiler: BrowserProfiler):
+    """Let the user pick a profile by number and delete it after confirmation.
+
+    Lists the available profiles, prompts for a 1-based row number, asks for
+    confirmation, then calls ``profiler.delete_profile`` with the profile's
+    path and reports whether it succeeded.
+    """
+    profiles = profiler.list_profiles()
+    if not profiles:
+        console.print("[yellow]No profiles found to delete.[/yellow]")
+        return
+
+    display_profiles_table(profiles)
+
+    valid_numbers = [str(n) for n in range(1, len(profiles) + 1)]
+    selection = Prompt.ask(
+        "[red]Enter number of profile to delete[/red]",
+        console=console,
+        choices=valid_numbers,
+        show_choices=False
+    )
+
+    try:
+        profile = profiles[int(selection) - 1]
+
+        confirmed = Confirm.ask(f"[red]Are you sure you want to delete profile '{profile['name']}'?[/red]")
+        if not confirmed:
+            return
+
+        if profiler.delete_profile(profile["path"]):
+            console.print(f"[green]Profile '{profile['name']}' deleted successfully.[/green]")
+        else:
+            console.print(f"[red]Failed to delete profile '{profile['name']}'.[/red]")
+    except (ValueError, IndexError):
+        console.print("[red]Invalid selection.[/red]")
+        
+async def crawl_with_profile_cli(profile_path, url):
+    """Crawl ``url`` with the browser profile stored at ``profile_path``.
+
+    Runs a visible (non-headless) managed browser on the profile directory,
+    asks which output format to print, and prints the result in that format.
+
+    Returns:
+        The crawl result, or ``None`` if crawling or rendering raised.
+    """
+    console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]")
+
+    # headless=False so the user can watch the browser work.
+    browser_cfg = BrowserConfig(
+        headless=False,
+        use_managed_browser=True,
+        user_data_dir=profile_path
+    )
+    crawler_cfg = CrawlerRunConfig()
+
+    output_format = Prompt.ask(
+        "[cyan]Output format[/cyan]",
+        choices=["all", "json", "markdown", "md", "title"],
+        default="markdown"
+    )
+
+    # One renderer per accepted format; "markdown" and "md" are aliases.
+    def _render_markdown(res):
+        return res.markdown.raw_markdown
+
+    renderers = {
+        "all": lambda res: json.dumps(res.model_dump(), indent=2),
+        "json": lambda res: json.dumps(json.loads(res.extracted_content), indent=2),
+        "markdown": _render_markdown,
+        "md": _render_markdown,
+        "title": lambda res: res.metadata.get("title", "No title found"),
+    }
+
+    try:
+        result = await run_crawler(url, browser_cfg, crawler_cfg, True)
+
+        render = renderers.get(output_format)
+        if render is not None:
+            console.print(render(result))
+
+        console.print(f"[green]Successfully crawled[/green] {url}")
+        return result
+    except Exception as exc:
+        console.print(f"[red]Error crawling:[/red] {str(exc)}")
+        return None
+        
+async def use_profile_to_crawl():
+    """Prompt for a stored profile and a URL, then crawl that URL with the profile.
+
+    Lists the profiles known to a fresh ``BrowserProfiler``, asks for a 1-based
+    row number and a URL, and hands both to ``crawl_with_profile_cli``.
+    """
+    profiler = BrowserProfiler()
+    profiles = profiler.list_profiles()
+    if not profiles:
+        console.print("[yellow]No profiles found. Create one first.[/yellow]")
+        return
+
+    display_profiles_table(profiles)
+
+    valid_numbers = [str(n) for n in range(1, len(profiles) + 1)]
+    selection = Prompt.ask(
+        "[cyan]Enter number of profile to use[/cyan]",
+        console=console,
+        choices=valid_numbers,
+        show_choices=False
+    )
+
+    try:
+        profile = profiles[int(selection) - 1]
+
+        url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]")
+        if not url:
+            console.print("[red]No URL provided[/red]")
+            return
+
+        await crawl_with_profile_cli(profile["path"], url)
+    except (ValueError, IndexError):
+        console.print("[red]Invalid selection[/red]")
+
+async def manage_profiles():
+    """Run the interactive profile-manager menu until the user chooses Exit.
+
+    Each pass prints the menu, reads a choice (default ``"1"``), performs the
+    matching action (list, create, delete, crawl) and prints a blank
+    separator before showing the menu again.
+    """
+    profiler = BrowserProfiler()
+
+    menu = {
+        "1": "List profiles",
+        "2": "Create new profile",
+        "3": "Delete profile",
+        "4": "Use a profile to crawl a website",
+        "5": "Exit",
+    }
+    # Colour per menu key; anything not listed falls back to cyan.
+    key_colors = {"1": "green", "2": "yellow", "3": "red", "4": "blue"}
+
+    while True:
+        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
+
+        for key, label in menu.items():
+            color = key_colors.get(key, "cyan")
+            console.print(f"[{color}]{key}[/{color}]. {label}")
+
+        choice = Prompt.ask("Enter choice", choices=list(menu), default="1")
+
+        if choice == "5":
+            console.print("[cyan]Exiting profile manager.[/cyan]")
+            break
+
+        if choice == "1":
+            display_profiles_table(profiler.list_profiles())
+        elif choice == "2":
+            await create_profile_interactive(profiler)
+        elif choice == "3":
+            delete_profile_interactive(profiler)
+        elif choice == "4":
+            await use_profile_to_crawl()
+
+        # Visual gap before the menu is printed again.
+        console.print("\n")
+
+
+
+# Root click group for the CLI. `-h` is accepted as an alias for `--help`.
+# Sub-groups and commands attach to it through @cli.group / @cli.command.
+# The docstring below is what click prints as help text, so it is user-facing
+# output rather than developer documentation.
+@click.group(context_settings={"help_option_names": ["-h", "--help"]})
+def cli():
+    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
+    pass
+
+
+# Sub-group `crwl browser ...`. The docstring is click's help text. The `\b`
+# line stops click from re-wrapping the command list into a single paragraph.
+@cli.group("browser")
+def browser_cmd():
+    """Manage browser instances for Crawl4AI
+
+    Commands to manage browser instances for Crawl4AI, including:
+
+    \b
+    - status - Check status of the builtin browser
+    - start - Start a new builtin browser
+    - stop - Stop the running builtin browser
+    - view - Open a visible window of the builtin browser
+    - restart - Restart the builtin browser
+    """
+    pass
+    
+@browser_cmd.command("status")
+def browser_status_cmd():
+    """Show status of the builtin browser"""
+    # Queries the profiler for the builtin browser and prints either its
+    # details or a hint on how to start one. Any error exits with status 1.
+    profiler = BrowserProfiler()
+    panel_title = "Builtin Browser Status"
+
+    try:
+        status = anyio.run(profiler.get_builtin_browser_status)
+
+        if not status["running"]:
+            console.print(Panel(
+                "[yellow]Builtin browser is not running[/yellow]\n\n"
+                "Use 'crwl browser start' to start a builtin browser",
+                title=panel_title,
+                border_style="yellow"
+            ))
+        else:
+            info = status["info"]
+            # Lines are built in display order; the empty entry yields the
+            # blank line after the headline.
+            details = [
+                "[green]Builtin browser is running[/green]",
+                "",
+                f"CDP URL: [cyan]{info['cdp_url']}[/cyan]",
+                f"Process ID: [yellow]{info['pid']}[/yellow]",
+                f"Browser type: [blue]{info['browser_type']}[/blue]",
+                f"User data directory: [magenta]{info['user_data_dir']}[/magenta]",
+                f"Started: [cyan]{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['start_time']))}[/cyan]",
+            ]
+            console.print(Panel(
+                "\n".join(details),
+                title=panel_title,
+                border_style="green"
+            ))
+
+    except Exception as exc:
+        console.print(f"[red]Error checking browser status: {str(exc)}[/red]")
+        sys.exit(1)
+        
+@browser_cmd.command("start")
+@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", 
+              help="Browser type (default: chromium)")
+@click.option("--port", "-p", type=int, default=9222, help="Debugging port (default: 9222)")
+@click.option("--headless/--no-headless", default=True, help="Run browser in headless mode")
+def browser_start_cmd(browser_type: str, port: int, headless: bool):
+    """Start a builtin browser instance
+    
+    This will start a persistent browser instance that can be used by Crawl4AI
+    by setting browser_mode="builtin" in BrowserConfig.
+    """
+    # Flow: refuse if a builtin browser is already running, otherwise launch
+    # one and print its CDP URL. Failures exit with status 1.
+    profiler = BrowserProfiler()
+
+    try:
+        # The status check lives inside the try so a failing lookup is reported
+        # the same way as in the other browser commands.
+        status = anyio.run(profiler.get_builtin_browser_status)
+        if status["running"]:
+            # Read the CDP URL from status["info"], the same place the status
+            # and view commands take it from.
+            console.print(Panel(
+                "[yellow]Builtin browser is already running[/yellow]\n\n"
+                f"CDP URL: [cyan]{status['info']['cdp_url']}[/cyan]\n\n"
+                "Use 'crwl browser restart' to restart the browser",
+                title="Builtin Browser Start",
+                border_style="yellow"
+            ))
+            return
+
+        console.print(Panel(
+            f"[cyan]Starting builtin browser[/cyan]\n\n"
+            f"Browser type: [green]{browser_type}[/green]\n"
+            f"Debugging port: [yellow]{port}[/yellow]\n"
+            f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
+            title="Builtin Browser Start",
+            border_style="cyan"
+        ))
+
+        cdp_url = anyio.run(
+            profiler.launch_builtin_browser,
+            browser_type,
+            port,
+            headless
+        )
+
+        if cdp_url:
+            console.print(Panel(
+                f"[green]Builtin browser started successfully[/green]\n\n"
+                f"CDP URL: [cyan]{cdp_url}[/cyan]\n\n"
+                "This browser will be used automatically when setting browser_mode='builtin'",
+                title="Builtin Browser Start",
+                border_style="green"
+            ))
+        else:
+            console.print(Panel(
+                "[red]Failed to start builtin browser[/red]",
+                title="Builtin Browser Start",
+                border_style="red"
+            ))
+            # SystemExit is not an Exception subclass, so the handler below
+            # does not intercept it.
+            sys.exit(1)
+
+    except Exception as e:
+        console.print(f"[red]Error starting builtin browser: {str(e)}[/red]")
+        sys.exit(1)
+        
+@browser_cmd.command("stop")
+def browser_stop_cmd():
+    """Stop the running builtin browser"""
+    # Reports "nothing running" and returns, or kills the builtin browser and
+    # reports the outcome. Failures exit with status 1.
+    profiler = BrowserProfiler()
+    panel_title = "Builtin Browser Stop"
+
+    try:
+        status = anyio.run(profiler.get_builtin_browser_status)
+        if not status["running"]:
+            console.print(Panel(
+                "[yellow]No builtin browser is currently running[/yellow]",
+                title=panel_title,
+                border_style="yellow"
+            ))
+            return
+
+        console.print(Panel(
+            "[cyan]Stopping builtin browser...[/cyan]",
+            title=panel_title,
+            border_style="cyan"
+        ))
+
+        stopped = anyio.run(profiler.kill_builtin_browser)
+        if not stopped:
+            console.print(Panel(
+                "[red]Failed to stop builtin browser[/red]",
+                title=panel_title,
+                border_style="red"
+            ))
+            sys.exit(1)
+
+        console.print(Panel(
+            "[green]Builtin browser stopped successfully[/green]",
+            title=panel_title,
+            border_style="green"
+        ))
+
+    except Exception as exc:
+        console.print(f"[red]Error stopping builtin browser: {str(exc)}[/red]")
+        sys.exit(1)
+        
+@browser_cmd.command("view")
+@click.option("--url", "-u", help="URL to navigate to (defaults to about:blank)")
+def browser_view_cmd(url: Optional[str]):
+    """
+    Open a visible window of the builtin browser
+    
+    This command connects to the running builtin browser and opens a visible window,
+    allowing you to see what the browser is currently viewing or navigate to a URL.
+    """
+    # Flow: require a running builtin browser, print what will be opened, then
+    # spawn a Chrome process with the builtin browser's debugging port.
+    profiler = BrowserProfiler()
+
+    try:
+        # First check if browser is running
+        status = anyio.run(profiler.get_builtin_browser_status)
+        if not status["running"]:
+            console.print(Panel(
+                "[yellow]No builtin browser is currently running[/yellow]\n\n"
+                "Use 'crwl browser start' to start a builtin browser first",
+                title="Builtin Browser View",
+                border_style="yellow"
+            ))
+            return
+
+        info = status["info"]
+        cdp_url = info["cdp_url"]
+
+        console.print(Panel(
+            f"[cyan]Opening visible window connected to builtin browser[/cyan]\n\n"
+            f"CDP URL: [green]{cdp_url}[/green]\n"
+            f"URL to load: [yellow]{url or 'about:blank'}[/yellow]",
+            title="Builtin Browser View",
+            border_style="cyan"
+        ))
+
+        # Imported locally; only this command spawns an external process.
+        import subprocess
+
+        # Default Chrome location per platform. Named chrome_cmd so it does not
+        # shadow the module-level `browser_cmd` click group.
+        if sys.platform == "darwin":  # macOS
+            chrome_cmd = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"]
+        elif sys.platform == "win32":  # Windows
+            chrome_cmd = ["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"]
+        else:  # Linux
+            chrome_cmd = ["google-chrome"]
+
+        # NOTE(review): this passes the builtin browser's own debugging port to
+        # a newly spawned Chrome; confirm this attaches to the existing
+        # instance rather than trying to bind the port a second time.
+        chrome_args = [
+            f"--remote-debugging-port={info['debugging_port']}",
+            "--remote-debugging-address=localhost",
+            "--no-first-run",
+            "--no-default-browser-check"
+        ]
+
+        # Add URL if provided
+        if url:
+            chrome_args.append(url)
+
+        # Launch browser; a launch failure is reported but is not fatal.
+        try:
+            subprocess.Popen(chrome_cmd + chrome_args)
+            console.print("[green]Browser window opened. Close it when finished viewing.[/green]")
+        except Exception as e:
+            console.print(f"[red]Error launching browser: {str(e)}[/red]")
+            console.print(f"[yellow]Try connecting manually to {cdp_url} in Chrome or using the '--remote-debugging-port' flag.[/yellow]")
+
+    except Exception as e:
+        console.print(f"[red]Error viewing builtin browser: {str(e)}[/red]")
+        sys.exit(1)
+
+@browser_cmd.command("restart")
+@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default=None, 
+              help="Browser type (defaults to same as current)")
+@click.option("--port", "-p", type=int, default=None, help="Debugging port (defaults to same as current)")
+@click.option("--headless/--no-headless", default=None, help="Run browser in headless mode")
+def browser_restart_cmd(browser_type: Optional[str], port: Optional[int], headless: Optional[bool]):
+    """Restart the builtin browser
+    
+    Stops the current builtin browser if running and starts a new one.
+    By default, uses the same configuration as the current browser.
+    """
+    # Flow: remember the running browser's settings (if any), stop it, then
+    # launch a new one using explicit options first and remembered values next.
+    profiler = BrowserProfiler()
+    panel_title = "Builtin Browser Restart"
+
+    try:
+        status = anyio.run(profiler.get_builtin_browser_status)
+        previous = {}
+
+        if status["running"]:
+            info = status["info"]
+            previous = {
+                "browser_type": info["browser_type"],
+                "port": info["debugging_port"],
+                "headless": True  # Default assumption
+            }
+
+            console.print(Panel(
+                "[cyan]Stopping current builtin browser...[/cyan]",
+                title=panel_title,
+                border_style="cyan"
+            ))
+
+            stopped = anyio.run(profiler.kill_builtin_browser)
+            if not stopped:
+                console.print(Panel(
+                    "[red]Failed to stop current browser[/red]",
+                    title=panel_title,
+                    border_style="red"
+                ))
+                sys.exit(1)
+
+        # Explicit CLI options win; otherwise reuse the previous settings,
+        # falling back to chromium / 9222 / headless.
+        browser_type = browser_type or previous.get("browser_type", "chromium")
+        port = port or previous.get("port", 9222)
+        if headless is None:
+            headless = previous.get("headless", True)
+
+        console.print(Panel(
+            f"[cyan]Starting new builtin browser[/cyan]\n\n"
+            f"Browser type: [green]{browser_type}[/green]\n"
+            f"Debugging port: [yellow]{port}[/yellow]\n"
+            f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
+            title=panel_title,
+            border_style="cyan"
+        ))
+
+        cdp_url = anyio.run(
+            profiler.launch_builtin_browser,
+            browser_type,
+            port,
+            headless
+        )
+
+        if not cdp_url:
+            console.print(Panel(
+                "[red]Failed to restart builtin browser[/red]",
+                title=panel_title,
+                border_style="red"
+            ))
+            sys.exit(1)
+
+        console.print(Panel(
+            f"[green]Builtin browser restarted successfully[/green]\n\n"
+            f"CDP URL: [cyan]{cdp_url}[/cyan]",
+            title=panel_title,
+            border_style="green"
+        ))
+
+    except Exception as exc:
+        console.print(f"[red]Error restarting builtin browser: {str(exc)}[/red]")
+        sys.exit(1)
+
+@cli.command("cdp")
+@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)")
+@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)")
+@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
+              help="Browser type (default: chromium)")
+@click.option("--headless", is_flag=True, help="Run browser in headless mode")
+@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)")
+def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool):
+    """Launch a standalone browser with CDP debugging enabled
+    
+    This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled,
+    prints the CDP URL, and keeps the browser running until you press 'q'.
+    
+    The CDP URL can be used for various automation and debugging tasks.
+    
+    Examples:
+        # Launch Chromium with CDP on default port 9222
+        crwl cdp
+        
+        # Use a specific directory for browser data and custom port
+        crwl cdp --user-data-dir ~/browser-data --port 9223
+        
+        # Launch in headless mode
+        crwl cdp --headless
+        
+        # Launch in incognito mode (ignores user-data-dir)
+        crwl cdp --incognito
+    """
+    profiler = BrowserProfiler()
+
+    try:
+        # Resolve the data directory; incognito runs ignore --user-data-dir entirely
+        data_dir = None
+        if user_data_dir and not incognito:
+            # Expand "~" so shell-style home paths work
+            data_dir = os.path.expanduser(user_data_dir)
+            if not os.path.exists(data_dir):
+                console.print(f"[yellow]Directory '{data_dir}' doesn't exist. Creating it.[/yellow]")
+                os.makedirs(data_dir, exist_ok=True)
+
+        # Summarise the launch settings before starting the browser
+        headless_label = "Yes" if headless else "No"
+        incognito_label = "Yes" if incognito else "No"
+        launch_summary = (
+            f"[cyan]Launching browser with CDP debugging[/cyan]\n\n"
+            f"Browser type: [green]{browser_type}[/green]\n"
+            f"Debugging port: [yellow]{port}[/yellow]\n"
+            f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n"
+            f"Headless: [cyan]{headless_label}[/cyan]\n"
+            f"Incognito: [cyan]{incognito_label}[/cyan]\n\n"
+            f"[yellow]Press 'q' to quit when done[/yellow]"
+        )
+        console.print(Panel(launch_summary, title="CDP Browser", border_style="cyan"))
+
+        # anyio.run drives the profiler's launcher; a falsy result is treated as a failed launch
+        cdp_url = anyio.run(profiler.launch_standalone_browser, browser_type, data_dir, port, headless)
+        if not cdp_url:
+            console.print("[red]Failed to launch browser or get CDP URL[/red]")
+            sys.exit(1)
+
+    except Exception as e:
+        console.print(f"[red]Error launching CDP browser: {str(e)}[/red]")
+        sys.exit(1)
+
+
+@cli.command("crawl")
+@click.argument("url", required=True)
+@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
+@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
+@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
+@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
+@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
+@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
+@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
+@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
+@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
+# No short alias: "-b" already belongs to --browser above, and registering it twice
+# made "-b key=value" ambiguous. This matches the option on the default command.
+# NOTE(review): as a flag defaulting to True this cannot be switched off from the CLI.
+@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
+@click.option("--question", "-q", help="Ask a question about the crawled content")
+@click.option("--verbose", "-v", is_flag=True)
+@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, 
+           extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
+           output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+    """Crawl a website and extract content
+    
+    Simple Usage:
+        crwl crawl https://example.com
+    """
+
+    # Handle profile option: resolve the named profile to a user data directory
+    if profile:
+        profiler = BrowserProfiler()
+        profile_path = profiler.get_profile_path(profile)
+
+        if not profile_path:
+            profiles = profiler.list_profiles()
+
+            if profiles:
+                console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
+                display_profiles_table(profiles)
+            else:
+                console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")
+
+            return
+
+        # Include the profile in browser config
+        if not browser:
+            browser = {}
+        browser["user_data_dir"] = profile_path
+        browser["use_managed_browser"] = True
+
+        if verbose:
+            console.print(f"[green]Using browser profile:[/green] {profile}")
+
+    try:
+        # Load base configurations
+        browser_cfg = BrowserConfig.load(load_config_file(browser_config))
+        crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))
+
+        # Override with CLI params
+        if browser:
+            browser_cfg = browser_cfg.clone(**browser)
+        if crawler:
+            crawler_cfg = crawler_cfg.clone(**crawler)
+
+        # Handle content filter config
+        if filter_config or output in ["markdown-fit", "md-fit"]:
+            if filter_config:
+                filter_conf = load_config_file(filter_config)
+            else:
+                # Fit-markdown output was requested without a filter file:
+                # fall back to a pruning filter so fit_markdown gets produced.
+                filter_conf = {
+                    "type": "pruning",
+                    "query": "",
+                    "threshold": 0.48
+                }
+            if filter_conf["type"] == "bm25":
+                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+                    content_filter = BM25ContentFilter(
+                        user_query=filter_conf.get("query"),
+                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                    )
+                )
+            elif filter_conf["type"] == "pruning":
+                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+                    content_filter = PruningContentFilter(
+                        user_query=filter_conf.get("query"),
+                        threshold=filter_conf.get("threshold", 0.48)
+                    )
+                )
+
+        # Handle json-extract option (takes precedence over extraction-config)
+        if json_extract is not None:
+            # Get LLM provider and token
+            provider, token = setup_llm_config()
+
+            # Default sophisticated instruction for structured data extraction
+            default_instruction = """Analyze the web page content and extract structured data as JSON. 
+If the page contains a list of items with repeated patterns, extract all items in an array. 
+If the page is an article or contains unique content, extract a comprehensive JSON object with all relevant information.
+Look at the content, intention of content, what it offers and find the data item(s) in the page.
+Always return valid, properly formatted JSON."""
+
+            default_instruction_with_user_query = """Analyze the web page content and extract structured data as JSON, following the below instruction and explanation of schema and always return valid, properly formatted JSON. \n\nInstruction:\n\n""" + json_extract
+
+            # An empty string means "-j" was given without a description
+            instruction = default_instruction_with_user_query if json_extract else default_instruction
+
+            # Create LLM extraction strategy
+            crawler_cfg.extraction_strategy = LLMExtractionStrategy(
+                llm_config=LLMConfig(provider=provider, api_token=token),
+                instruction=instruction,
+                schema=load_schema_file(schema),  # Will be None if no schema is provided
+                extraction_type="schema", #if schema else "block",
+                apply_chunking=False,
+                force_json_response=True,
+                verbose=verbose,
+            )
+
+            # Set output to JSON if not explicitly specified
+            if output == "all":
+                output = "json"
+
+        # Handle extraction strategy from config file (only if json-extract wasn't used)
+        elif extraction_config:
+            extract_conf = load_config_file(extraction_config)
+            schema_data = load_schema_file(schema)
+
+            # Reject a missing or unknown extraction type with a clear message
+            if not extract_conf.get("type"):
+                raise click.ClickException("Extraction type not specified")
+            if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
+                raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")
+
+            if extract_conf["type"] == "llm":
+                # LLM extraction cannot run without credentials
+                if not extract_conf.get("provider") or not extract_conf.get("api_token"):
+                    raise click.ClickException("LLM provider and API token are required for LLM extraction")
+
+                crawler_cfg.extraction_strategy = LLMExtractionStrategy(
+                    llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
+                    instruction=extract_conf["instruction"],
+                    schema=schema_data,
+                    **extract_conf.get("params", {})
+                )
+            elif extract_conf["type"] == "json-css":
+                crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
+                    schema=schema_data
+                )
+            elif extract_conf["type"] == "json-xpath":
+                crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
+                    schema=schema_data
+                )
+
+        # No cache
+        if bypass_cache:
+            crawler_cfg.cache_mode = CacheMode.BYPASS
+
+        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
+
+        config = get_global_config()
+
+        # NOTE(review): the global VERBOSE setting replaces whatever verbose value the
+        # config files or -b/-c overrides supplied; --verbose is only forwarded to
+        # run_crawler below. Confirm this precedence is intended.
+        browser_cfg.verbose = config.get("VERBOSE", False)
+        crawler_cfg.verbose = config.get("VERBOSE", False)
+
+        # Run crawler
+        result : CrawlResult = anyio.run(
+            run_crawler,
+            url,
+            browser_cfg,
+            crawler_cfg,
+            verbose
+        )
+
+        # Handle question: stream the LLM answer and skip the regular output
+        if question:
+            provider, token = setup_llm_config()
+            markdown = result.markdown.raw_markdown
+            anyio.run(stream_llm_response, url, markdown, question, provider, token)
+            return
+
+        # Handle output: pick the payload once, then send it to the file or stdout
+        if output == "all":
+            rendered = json.dumps(result.model_dump(), indent=2)
+        elif output == "json":
+            if output_file:
+                # Files receive the extraction payload exactly as produced
+                rendered = result.extracted_content
+            else:
+                # Round-trip through json so stdout gets validated, pretty-printed JSON
+                rendered = json.dumps(json.loads(result.extracted_content), indent=2)
+        elif output in ["markdown", "md"]:
+            rendered = result.markdown.raw_markdown
+        else:
+            # "markdown-fit" / "md-fit": the only values click.Choice leaves over
+            rendered = result.markdown.fit_markdown
+
+        if output_file:
+            # Explicit UTF-8 so crawled text does not depend on the platform default encoding
+            with open(output_file, "w", encoding="utf-8") as f:
+                f.write(rendered)
+        else:
+            click.echo(rendered)
+
+    except Exception as e:
+        # Surface any failure as a regular CLI error instead of a traceback
+        raise click.ClickException(str(e))
+
+@cli.command("examples")
+def examples_cmd():
+    """Show usage examples"""
+    # Same helper the default command calls for its --example flag.
+    show_examples()
+
+@cli.group("config")
+def config_cmd():
+    """Manage global configuration settings
+    
+    Commands to view and update global configuration settings:
+    - list: Display all current configuration settings
+    - get: Get the value of a specific setting
+    - set: Set the value of a specific setting
+    """
+    # Group container only: the subcommands attach themselves via @config_cmd.command(...).
+    pass
+
+@config_cmd.command("list")
+def config_list_cmd():
+    """List all configuration settings"""
+    config = get_global_config()
+
+    table = Table(title="Crawl4AI Configuration", show_header=True, header_style="bold cyan", border_style="blue")
+    for heading, colour in (
+        ("Setting", "cyan"),
+        ("Value", "green"),
+        ("Default", "yellow"),
+        ("Description", "white"),
+    ):
+        table.add_column(heading, style=colour)
+
+    for name, spec in USER_SETTINGS.items():
+        # Fall back to the declared default when the user never set this key
+        current = config.get(name, spec["default"])
+        default_text = str(spec["default"])
+
+        if spec["type"] == "boolean":
+            # Booleans are always shown lowercase (true/false), for value and default alike
+            shown = str(current).lower()
+            default_text = default_text.lower()
+        elif spec.get("secret", False) and current:
+            # Never echo a stored secret; an unset secret is shown as-is
+            shown = "********"
+        else:
+            shown = str(current)
+
+        table.add_row(name, shown, default_text, spec["description"])
+
+    console.print(table)
+
+@config_cmd.command("get")
+@click.argument("key", required=True)
+def config_get_cmd(key: str):
+    """Get a specific configuration setting"""
+    config = get_global_config()
+
+    # Setting names are looked up uppercase, so accept any casing
+    key = key.upper()
+
+    if key not in USER_SETTINGS:
+        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
+        return
+
+    spec = USER_SETTINGS[key]
+    value = config.get(key, spec["default"])
+
+    # Mask secrets that actually hold a value; everything else is printed verbatim
+    is_masked = spec.get("secret", False) and value
+    display_value = "********" if is_masked else value
+
+    console.print(f"[cyan]{key}[/cyan] = [green]{display_value}[/green]")
+    console.print(f"[dim]Description: {spec['description']}[/dim]")
+
+@config_cmd.command("set")
+@click.argument("key", required=True)
+@click.argument("value", required=True)
+def config_set_cmd(key: str, value: str):
+    """Set a configuration setting"""
+    config = get_global_config()
+
+    # Setting names are looked up uppercase, so accept any casing
+    key = key.upper()
+
+    if key not in USER_SETTINGS:
+        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
+        console.print(f"[yellow]Available settings: {', '.join(USER_SETTINGS.keys())}[/yellow]")
+        return
+
+    setting = USER_SETTINGS[key]
+    setting_type = setting["type"]
+
+    # Type conversion and validation
+    if setting_type == "boolean":
+        lowered = value.lower()
+        if lowered in ("true", "yes", "1", "y"):
+            typed_value = True
+        elif lowered in ("false", "no", "0", "n"):
+            typed_value = False
+        else:
+            console.print("[red]Error: Invalid boolean value. Use 'true' or 'false'.[/red]")
+            return
+    elif setting_type == "string":
+        typed_value = value
+
+        # Check if the value should be one of the allowed options
+        if "options" in setting and value not in setting["options"]:
+            console.print(f"[red]Error: Value must be one of: {', '.join(setting['options'])}[/red]")
+            return
+    else:
+        # Any other declared type used to fall through with typed_value unbound and
+        # crash with UnboundLocalError; report it and leave the config untouched.
+        console.print(f"[red]Error: Unsupported setting type '{setting_type}' for '{key}'[/red]")
+        return
+
+    # Update config
+    config[key] = typed_value
+    save_global_config(config)
+
+    # Never echo a stored secret back to the terminal
+    display_value = typed_value
+    if setting.get("secret", False) and typed_value:
+        display_value = "********"
+
+    console.print(f"[green]Successfully set[/green] [cyan]{key}[/cyan] = [green]{display_value}[/green]")
+
+@cli.command("profiles")
+def profiles_cmd():
+    """Manage browser profiles interactively
+    
+    Launch an interactive browser profile manager where you can:
+    - List all existing profiles
+    - Create new profiles for authenticated browsing
+    - Delete unused profiles
+    """
+    # Run interactive profile manager; manage_profiles is handed to anyio.run uncalled
+    # so that anyio executes it and this command returns once it finishes.
+    anyio.run(manage_profiles)
+
+@cli.command(name="")
+@click.argument("url", required=False)
+@click.option("--example", is_flag=True, help="Show usage examples")
+@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
+@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
+@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
+@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
+@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
+@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
+@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
+@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
+@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
+@click.option("--question", "-q", help="Ask a question about the crawled content")
+@click.option("--verbose", "-v", is_flag=True)
+@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, 
+        extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
+        output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+    """Crawl4AI CLI - Web content extraction tool
+
+    Simple Usage:
+        crwl https://example.com
+    
+    Run with --example to see detailed usage examples.
+    
+    Other commands:
+        crwl profiles   - Manage browser profiles for identity-based crawling
+        crwl crawl      - Crawl a website with advanced options
+        crwl cdp        - Launch browser with CDP debugging enabled
+        crwl browser    - Manage builtin browser (start, stop, status, restart)
+        crwl config     - Manage global configuration settings
+        crwl examples   - Show more usage examples
+        
+    Configuration Examples:
+        crwl config list                         - List all configuration settings
+        crwl config get DEFAULT_LLM_PROVIDER     - Show current LLM provider
+        crwl config set VERBOSE true             - Enable verbose mode globally
+        crwl config set BROWSER_HEADLESS false   - Default to visible browser
+    """
+
+    # --example wins over everything else, including a supplied URL
+    if example:
+        show_examples()
+        return
+
+    ctx = click.get_current_context()
+
+    if url:
+        # Hand every option through to the crawl command unchanged
+        forwarded = dict(
+            url=url,
+            browser_config=browser_config,
+            crawler_config=crawler_config,
+            filter_config=filter_config,
+            extraction_config=extraction_config,
+            json_extract=json_extract,
+            schema=schema,
+            browser=browser,
+            crawler=crawler,
+            output=output,
+            bypass_cache=bypass_cache,
+            question=question,
+            verbose=verbose,
+            profile=profile,
+        )
+        ctx.invoke(crawl_cmd, **forwarded)
+        return
+
+    # No URL: print the help text rather than failing with a usage error
+    click.echo(ctx.get_help())
+
+def main():
+    """Run the CLI, routing to the `crawl` command when no known command is named."""
+    import sys
+
+    argv = sys.argv
+    names_known_command = len(argv) >= 2 and argv[1] in cli.commands
+    if not names_known_command:
+        # Anything that is not a registered command name gets "crawl" put in front of it
+        argv.insert(1, "crawl")
+    cli()
+
+# Start the CLI only when this module is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
+```
+
+
+## File: crawl4ai/extraction_strategy.py
+
+```py
+from abc import ABC, abstractmethod
+import inspect
+from typing import Any, List, Dict, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import json
+import time
+
+from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
+from .config import (
+    DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
+    CHUNK_TOKEN_THRESHOLD,
+    OVERLAP_RATE,
+    WORD_TOKEN_RATE,
+)
+from .utils import *  # noqa: F403
+
+from .utils import (
+    sanitize_html,
+    escape_json_string,
+    perform_completion_with_backoff,
+    extract_xml_data,
+    split_and_parse_json_objects,
+    sanitize_input_encode,
+    merge_chunks,
+)
+from .models import * # noqa: F403
+
+from .models import TokenUsage
+
+from .model_loader import * # noqa: F403
+from .model_loader import (
+    get_device,
+    load_HF_embedding_model,
+    load_text_multilabel_classifier,
+    calculate_batch_size
+)
+
+from .types import LLMConfig, create_llm_config
+
+from functools import partial
+import numpy as np
+import re
+from bs4 import BeautifulSoup
+from lxml import html, etree
+
+
+class ExtractionStrategy(ABC):
+    """
+    Common base for extraction strategies.
+
+    Subclasses implement `extract` for a single piece of content; `run` fans a
+    list of sections out to `extract` on a thread pool.
+    """
+
+    def __init__(self, input_format: str = "markdown", **kwargs):
+        """
+        Set up the attributes shared by every strategy.
+
+        Args:
+            input_format: Content format to use for extraction.
+                         Options: "markdown" (default), "html", "fit_markdown"
+            **kwargs: Extra options; only "verbose" is read here (defaults to False)
+        """
+        self.verbose = kwargs.get("verbose", False)
+        self.name = type(self).__name__
+        self.DEL = "<|DEL|>"
+        self.input_format = input_format
+
+    @abstractmethod
+    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Extract blocks or chunks from one piece of content.
+
+        :param url: The URL of the webpage.
+        :param html: The content to extract from.
+        :return: A list of extracted blocks or chunks.
+        """
+        pass
+
+    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Run `extract` on every section using a thread pool and merge the results.
+
+        Results are collected as the tasks finish, so the order of the merged
+        blocks is not tied to the order of `sections`.
+
+        :param url: The URL of the webpage.
+        :param sections: List of sections (strings) to process.
+        :return: A flat list with the blocks returned for all sections.
+        """
+        merged = []
+        with ThreadPoolExecutor() as pool:
+            pending = []
+            for section in sections:
+                pending.append(pool.submit(self.extract, url, section, **kwargs))
+            for finished in as_completed(pending):
+                merged += finished.result()
+        return merged
+
+
+class NoExtractionStrategy(ExtractionStrategy):
+    """
+    Pass-through strategy: content is returned as-is, wrapped in block dictionaries.
+    """
+
+    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Return the whole input as a single block with index 0.
+        """
+        return [dict(index=0, content=html)]
+
+    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Wrap each section in its own block, numbered in input order, with an empty tag list.
+        """
+        blocks = []
+        for position, section in enumerate(sections):
+            blocks.append({"index": position, "tags": [], "content": section})
+        return blocks
+
+
+#######################################################
+# Strategies using clustering for text data extraction #
+#######################################################
+
+
+class CosineStrategy(ExtractionStrategy):
+    """
+    Extract meaningful blocks or chunks from the given HTML using cosine similarity.
+
+    How it works:
+    1. Pre-filter documents using embeddings and semantic_filter.
+    2. Perform clustering using cosine similarity.
+    3. Organize texts by their cluster labels, retaining order.
+    4. Filter clusters by word count.
+    5. Extract meaningful blocks or chunks from the filtered clusters.
+
+    Attributes:
+        semantic_filter (str): A keyword filter for document filtering.
+        word_count_threshold (int): Minimum number of words per cluster.
+        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
+        linkage_method (str): The linkage method for hierarchical clustering.
+        top_k (int): Number of top categories to extract.
+        model_name (str): The name of the sentence-transformers model.
+        sim_threshold (float): The similarity threshold for clustering.
+    """
+
+    def __init__(
+        self,
+        semantic_filter=None,
+        word_count_threshold=10,
+        max_dist=0.2,
+        linkage_method="ward",
+        top_k=3,
+        model_name="sentence-transformers/all-MiniLM-L6-v2",
+        sim_threshold=0.3,
+        **kwargs,
+    ):
+        """
+        Initialize the strategy with clustering parameters and load its models.
+
+        Args:
+            semantic_filter (str): A keyword filter for document filtering.
+            word_count_threshold (int): Minimum number of words per cluster.
+            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
+            linkage_method (str): The linkage method for hierarchical clustering.
+            top_k (int): Number of top categories to extract.
+            model_name (str): Name of the embedding model passed to load_HF_embedding_model.
+            sim_threshold (float): The similarity threshold used when filtering documents.
+            **kwargs: Forwarded to ExtractionStrategy; "verbose" enables progress logging.
+        """
+        super().__init__(**kwargs)
+
+        self.semantic_filter = semantic_filter
+        self.word_count_threshold = word_count_threshold
+        self.max_dist = max_dist
+        self.linkage_method = linkage_method
+        self.top_k = top_k
+        # Stored so the attribute listed in the class docstring actually exists
+        self.model_name = model_name
+        self.sim_threshold = sim_threshold
+        # Start of model loading, used for the timing log at the end
+        self.timer = time.time()
+        self.verbose = kwargs.get("verbose", False)
+
+        # Most recent embeddings computed by get_embeddings
+        self.buffer_embeddings = np.array([])
+
+        self.device = get_device()
+        self.default_batch_size = calculate_batch_size(self.device)
+
+        if self.verbose:
+            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
+
+        self.tokenizer, self.model = load_HF_embedding_model(model_name)
+        self.model.to(self.device)
+        # Evaluation mode: the model is only used for inference
+        self.model.eval()
+
+        self.get_embedding_method = "batch"
+
+        if self.verbose:
+            print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
+
+        self.nlp, _ = load_text_multilabel_classifier()
+
+        if self.verbose:
+            print(
+                f"[LOG] Model loaded {model_name}, models/reuters, took "
+                + str(time.time() - self.timer)
+                + " seconds"
+            )
+
+    def filter_documents_embeddings(
+        self, documents: List[str], semantic_filter: str, at_least_k: int = 20
+    ) -> List[str]:
+        """
+        Select documents by cosine similarity between their embeddings and the semantic_filter embedding.
+
+        Documents scoring at least `self.sim_threshold` are kept in their input order.
+        When fewer than `at_least_k` qualify, the best-scoring remaining documents
+        are appended to make up the difference.
+
+        Args:
+            documents (List[str]): A list of document texts.
+            semantic_filter (str): A keyword filter for document filtering; when empty
+                the documents are returned unchanged.
+            at_least_k (int): Target number of documents. Replaced by half the number
+                of documents when there are fewer than `at_least_k` of them.
+
+        Returns:
+            List[str]: The selected document texts.
+        """
+        if not semantic_filter:
+            return documents
+
+        if len(documents) < at_least_k:
+            at_least_k = len(documents) // 2
+
+        from sklearn.metrics.pairwise import cosine_similarity
+
+        # Embed the filter text first, then the documents
+        query_embedding = self.get_embeddings([semantic_filter])[0]
+        document_embeddings = self.get_embeddings(documents)
+
+        scores = cosine_similarity([query_embedding], document_embeddings).flatten()
+        scored = list(zip(documents, scores))
+
+        # Documents that clear the threshold, in input order
+        selected = [doc for doc, score in scored if score >= self.sim_threshold]
+
+        # Top up with the best of the rest when too few documents qualified
+        shortfall = at_least_k - len(selected)
+        if shortfall > 0:
+            below_threshold = sorted(
+                (pair for pair in scored if pair[1] < self.sim_threshold),
+                key=lambda pair: pair[1],
+                reverse=True,
+            )
+            selected.extend(doc for doc, _ in below_threshold[:shortfall])
+
+        # NOTE(review): this slice also caps the result at at_least_k, so the value acts
+        # as a maximum as well as a minimum (a single document yields an empty list).
+        # Confirm whether that is intended.
+        return selected[:at_least_k]
+
+    def get_embeddings(
+        self, sentences: List[str], batch_size=None, bypass_buffer=False
+    ):
+        """
+        Compute embeddings for a list of sentences with the loaded model.
+
+        Sentences are tokenized and run through the model in batches; each
+        sentence embedding is the mean of the model's last hidden state over
+        the token dimension. The result is also kept in `self.buffer_embeddings`.
+
+        Args:
+            sentences (List[str]): A list of text chunks (sentences).
+            batch_size (int, optional): Sentences per forward pass; defaults to
+                `self.default_batch_size`.
+            bypass_buffer (bool): Currently unused; kept for interface compatibility.
+
+        Returns:
+            NumPy array of embeddings, one row per sentence.
+
+        Raises:
+            ValueError: If `sentences` is empty (np.vstack needs at least one batch).
+        """
+        # The model is moved to self.device in __init__, so the same torch path works
+        # for every device. The old per-device branching had an unreachable second
+        # "cpu" branch and returned the previous call's buffer for unlisted device types.
+        import torch
+
+        if batch_size is None:
+            batch_size = self.default_batch_size
+
+        all_embeddings = []
+        for i in range(0, len(sentences), batch_size):
+            batch_sentences = sentences[i : i + batch_size]
+            encoded_input = self.tokenizer(
+                batch_sentences, padding=True, truncation=True, return_tensors="pt"
+            )
+            # Inputs must live on the same device as the model
+            encoded_input = {
+                key: tensor.to(self.device) for key, tensor in encoded_input.items()
+            }
+
+            # Inference only: skip gradient tracking
+            with torch.no_grad():
+                model_output = self.model(**encoded_input)
+
+            # Get embeddings from the last hidden state (mean pooling)
+            embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
+            all_embeddings.append(embeddings)
+
+        self.buffer_embeddings = np.vstack(all_embeddings)
+        return self.buffer_embeddings
+
+    def hierarchical_clustering(self, sentences: List[str], embeddings=None):
+        """
+        Perform hierarchical clustering on sentences and return cluster labels.
+
+        How it works:
+        1. Uses the supplied embeddings, or computes them with `get_embeddings`.
+        2. Computes pairwise cosine distances between the embeddings.
+        3. Builds a linkage with `self.linkage_method` and cuts it into flat
+           clusters at distance `self.max_dist`.
+
+        Args:
+            sentences (List[str]): A list of text chunks (sentences).
+            embeddings (optional): Precomputed embeddings, one row per
+                sentence. When None they are computed from `sentences`.
+
+        Returns:
+            NumPy array of cluster labels.
+        """
+        from scipy.cluster.hierarchy import linkage, fcluster
+        from scipy.spatial.distance import pdist
+
+        self.timer = time.time()
+        # Honour caller-provided embeddings instead of always recomputing them
+        if embeddings is None:
+            embeddings = self.get_embeddings(sentences, bypass_buffer=True)
+
+        # With fewer than two observations the condensed distance matrix is
+        # empty and `linkage` raises; everything belongs to one cluster
+        # (fcluster labels start at 1).
+        if len(embeddings) < 2:
+            return np.ones(len(embeddings), dtype=np.int32)
+
+        # Compute pairwise cosine distances
+        distance_matrix = pdist(embeddings, "cosine")
+        # Perform agglomerative clustering
+        linked = linkage(distance_matrix, method=self.linkage_method)
+        # Form flat clusters
+        labels = fcluster(linked, self.max_dist, criterion="distance")
+        return labels
+
+    def filter_clusters_by_word_count(
+        self, clusters: Dict[int, List[str]]
+    ) -> Dict[int, List[str]]:
+        """
+        Keep only the clusters whose combined text is long enough.
+
+        The texts of each cluster are joined with single spaces and split on
+        whitespace; a cluster survives when that word count is at least
+        `self.word_count_threshold`.
+
+        Args:
+            clusters (Dict[int, List[str]]): Mapping of cluster id to its texts.
+
+        Returns:
+            Dict[int, List[str]]: The surviving clusters, in the original order.
+        """
+        return {
+            cluster_id: texts
+            for cluster_id, texts in clusters.items()
+            if len(" ".join(texts).split()) >= self.word_count_threshold
+        }
+
+    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Cluster the text chunks of a page and tag every resulting cluster.
+
+        How it works:
+        1. Splits `html` on `self.DEL` into text chunks.
+        2. Pre-filters the chunks with `filter_documents_embeddings`.
+        3. Groups the chunks by the labels from `hierarchical_clustering`.
+        4. Drops clusters rejected by `filter_clusters_by_word_count`.
+        5. Tags the remaining clusters with `self.nlp`.
+
+        Args:
+            url (str): The URL of the webpage.
+            html (str): The page content, with chunks separated by `self.DEL`.
+
+        Returns:
+            List[Dict[str, Any]]: One dict per cluster with the keys
+            "index", "tags" and "content", sorted by cluster label.
+        """
+        chunks = self.filter_documents_embeddings(
+            html.split(self.DEL), self.semantic_filter
+        )
+        if not chunks:
+            return []
+
+        cluster_labels = self.hierarchical_clustering(chunks)
+
+        # Timer for the grouping + tagging phase reported below
+        t = time.time()
+
+        # Group chunks by label, keeping document order inside each cluster
+        grouped = {}
+        for position, label in enumerate(cluster_labels):
+            if label not in grouped:
+                grouped[label] = []
+            grouped[label].append(chunks[position])
+
+        kept = self.filter_clusters_by_word_count(grouped)
+
+        cluster_list = []
+        for idx in sorted(kept):
+            cluster_list.append(
+                {"index": int(idx), "tags": [], "content": " ".join(kept[idx])}
+            )
+
+        if self.verbose:
+            print(f"[LOG] 🚀 Assign tags using {self.device}")
+
+        if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
+            contents = [cluster["content"] for cluster in cluster_list]
+            for cluster, tags in zip(cluster_list, self.nlp(contents)):
+                cluster["tags"] = tags
+
+        if self.verbose:
+            print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
+
+        return cluster_list
+
+    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Join all sections and cluster them in a single `extract` call.
+
+        Args:
+            url (str): The URL of the webpage.
+            sections (List[str]): List of sections (strings) to process.
+            *q: Extra positional arguments; not forwarded.
+            **kwargs: Keyword arguments forwarded to `extract`.
+
+        Returns:
+            List[Dict[str, Any]]: Whatever `extract` returns for the joined text.
+        """
+        # This strategy processes all sections together
+        joined_text = self.DEL.join(sections)
+        return self.extract(url, joined_text, **kwargs)
+
+
+#######################################################
+# Strategies using LLM-based extraction for text data #
+#######################################################
+class LLMExtractionStrategy(ExtractionStrategy):
+    """
+    A strategy that uses an LLM to extract meaningful content from the HTML.
+
+    Attributes:
+        llm_config: The LLM configuration object.
+        instruction: The instruction to use for the LLM model.
+        schema: Pydantic model schema for structured data.
+        extraction_type: "block" or "schema".
+        chunk_token_threshold: Maximum tokens per chunk.
+        overlap_rate: Overlap between chunks.
+        word_token_rate: Word to token conversion rate.
+        apply_chunking: Whether to apply chunking.
+        verbose: Whether to print verbose output.
+        usages: List of individual token usages.
+        total_usage: Accumulated token usage.
+    """
+    # Deprecated constructor arguments, each mapped to the migration hint that
+    # is appended to the AttributeError raised when one of them is set.
+    _UNWANTED_PROPS = {
+            'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
+            'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
+            'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
+            'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
+        }
+    def __init__(
+        self,
+        llm_config: 'LLMConfig' = None,
+        instruction: str = None,
+        schema: Dict = None,
+        extraction_type="block",
+        chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
+        overlap_rate=OVERLAP_RATE,
+        word_token_rate=WORD_TOKEN_RATE,
+        apply_chunking=True,
+        input_format: str = "markdown",
+        force_json_response=False,
+        verbose=False,
+        # Deprecated arguments
+        provider: str = DEFAULT_PROVIDER,
+        api_token: Optional[str] = None,
+        base_url: str = None,
+        api_base: str = None,
+        **kwargs,
+    ):
+        """
+        Initialize the strategy with LLM extraction parameters.
+
+        Args:
+            llm_config: The LLM configuration object. When falsy, a config is
+                created from DEFAULT_PROVIDER and the API key read from the
+                environment variable named by DEFAULT_PROVIDER_API_KEY.
+            instruction: The instruction to use for the LLM model.
+            schema: Pydantic model schema for structured data. A truthy schema
+                forces the extraction type to "schema".
+            extraction_type: "block" or "schema".
+            chunk_token_threshold: Maximum tokens per chunk. A falsy value
+                falls back to CHUNK_TOKEN_THRESHOLD.
+            overlap_rate: Overlap between chunks.
+            word_token_rate: Word to token conversion rate.
+            apply_chunking: Whether to apply chunking. When False the chunk
+                threshold is raised to 1e9.
+            input_format: Content format to use for extraction.
+                            Options: "markdown" (default), "html", "fit_markdown"
+            force_json_response: Whether to force a JSON response from the LLM.
+            verbose: Whether to print verbose output.
+            **kwargs: Forwarded to the base class. The optional "extra_args"
+                entry (additional arguments for the API request, such as
+                temperature, max_tokens, etc.) is also stored on the instance.
+
+            # Deprecated arguments, will be removed very soon
+            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
+            api_token: The API token for the provider.
+            base_url: The base URL for the API request.
+            api_base: The base URL for the API request.
+
+        Raises:
+            AttributeError: Raised by `__setattr__` when one of the deprecated
+                arguments is given a value other than its signature default.
+        """
+        super().__init__( input_format=input_format, **kwargs)
+        self.llm_config = llm_config
+        if not self.llm_config:
+            # No config supplied: build one from the default provider
+            self.llm_config = create_llm_config(
+                provider=DEFAULT_PROVIDER,
+                api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
+            )
+        self.instruction = instruction
+        self.extract_type = extraction_type
+        self.schema = schema
+        if schema:
+            # A schema always implies schema-based extraction
+            self.extract_type = "schema"
+        self.force_json_response = force_json_response
+        self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
+        self.overlap_rate = overlap_rate
+        self.word_token_rate = word_token_rate
+        self.apply_chunking = apply_chunking
+        self.extra_args = kwargs.get("extra_args", {})
+        if not self.apply_chunking:
+            # Very large threshold used when chunking is turned off
+            self.chunk_token_threshold = 1e9
+        self.verbose = verbose
+        self.usages = []  # Store individual usages
+        self.total_usage = TokenUsage()  # Accumulated usage
+
+        # Deprecated attributes: these assignments pass through __setattr__,
+        # which rejects any value that is not the signature default.
+        self.provider = provider
+        self.api_token = api_token
+        self.base_url = base_url
+        self.api_base = api_base
+
+    
+    def __setattr__(self, name, value):
+        """
+        Set an attribute, rejecting deprecated ones that carry a real value.
+
+        A name listed in `_UNWANTED_PROPS` may only be assigned the very
+        object that is its default in the `__init__` signature; anything else
+        raises AttributeError with the matching migration hint.
+        """
+        # TODO: Planning to set properties dynamically based on the __init__ signature
+        init_params = inspect.signature(self.__init__).parameters
+
+        is_deprecated = name in self._UNWANTED_PROPS
+        if is_deprecated and value is not init_params[name].default:
+            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+
+        super().__setattr__(name, value)
+        
+    def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
+        """
+        Extract meaningful blocks or chunks from the given HTML using an LLM.
+
+        How it works:
+        1. Construct a prompt with variables.
+        2. Make a request to the LLM using the prompt and record token usage.
+        3. Parse the response and extract blocks or chunks. If parsing fails,
+           recover whatever JSON objects can be split out of the raw reply.
+
+        Args:
+            url: The URL of the webpage.
+            ix: Index of the block.
+            html: The HTML content of the webpage.
+
+        Returns:
+            A list of extracted blocks or chunks. When the request itself
+            fails, a single block with "error" set to True and the exception
+            text as "content".
+        """
+        if self.verbose:
+            print(f"[LOG] Call LLM for {url} - block index: {ix}")
+
+        variable_values = {
+            "URL": url,
+            "HTML": escape_json_string(sanitize_html(html)),
+        }
+
+        prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+        if self.instruction:
+            variable_values["REQUEST"] = self.instruction
+            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
+
+        if self.extract_type == "schema" and self.schema:
+            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
+            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
+
+        if self.extract_type == "schema" and not self.schema:
+            prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
+
+        for variable, replacement in variable_values.items():
+            prompt_with_variables = prompt_with_variables.replace(
+                "{" + variable + "}", replacement
+            )
+
+        try:
+            response = perform_completion_with_backoff(
+                self.llm_config.provider,
+                prompt_with_variables,
+                self.llm_config.api_token,
+                base_url=self.llm_config.base_url,
+                json_response=self.force_json_response,
+                extra_args=self.extra_args,
+            )
+            # Track usage
+            usage = TokenUsage(
+                completion_tokens=response.usage.completion_tokens,
+                prompt_tokens=response.usage.prompt_tokens,
+                total_tokens=response.usage.total_tokens,
+                completion_tokens_details=response.usage.completion_tokens_details.__dict__
+                if response.usage.completion_tokens_details
+                else {},
+                prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+                if response.usage.prompt_tokens_details
+                else {},
+            )
+            self.usages.append(usage)
+
+            # Update totals
+            self.total_usage.completion_tokens += usage.completion_tokens
+            self.total_usage.prompt_tokens += usage.prompt_tokens
+            self.total_usage.total_tokens += usage.total_tokens
+
+            # Keep the reply text under its own name. Rebinding `response` to
+            # this string used to make the recovery path below fail with an
+            # AttributeError, so partially valid replies were reported as a
+            # plain error instead of being salvaged.
+            content = response.choices[0].message.content
+
+            try:
+                if self.force_json_response:
+                    blocks = json.loads(content)
+                    if isinstance(blocks, dict):
+                        values = list(blocks.values())
+                        if len(values) == 1 and isinstance(values[0], list):
+                            # Single key wrapping a list, e.g. {"news": [..]}
+                            blocks = values[0]
+                        else:
+                            # A single object, e.g. {"article_id": "1234", ...}
+                            blocks = [blocks]
+                else:
+                    blocks = extract_xml_data(["blocks"], content)["blocks"]
+                    blocks = json.loads(blocks)
+
+                for block in blocks:
+                    block["error"] = False
+            except Exception:
+                # Best effort: keep every JSON object that parses and report
+                # the remainder as one error block.
+                parsed, unparsed = split_and_parse_json_objects(content)
+                blocks = parsed
+                if unparsed:
+                    blocks.append(
+                        {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+                    )
+
+            if self.verbose:
+                print(
+                    "[LOG] Extracted",
+                    len(blocks),
+                    "blocks from URL:",
+                    url,
+                    "block index:",
+                    ix,
+                )
+            return blocks
+        except Exception as e:
+            if self.verbose:
+                print(f"[LOG] Error in LLM extraction: {e}")
+            # Add error information to extracted_content
+            return [
+                {
+                    "index": ix,
+                    "error": True,
+                    "tags": ["error"],
+                    "content": str(e),
+                }
+            ]
+
+    def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
+        """
+        Combine documents into sections via `merge_chunks`.
+
+        `chunk_token_threshold` is passed as the target size, `overlap` as the
+        overlap, and `self.word_token_rate` as the word/token ratio.
+        """
+        return merge_chunks(
+            docs=documents,
+            target_size=chunk_token_threshold,
+            overlap=overlap,
+            word_token_ratio=self.word_token_rate,
+        )
+
+    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
+        """
+        Merge the sections into chunks and run `extract` on every chunk.
+
+        Providers whose name starts with "groq/" are processed sequentially
+        with a 500 ms pause after each chunk (rate limiting). All other
+        providers are processed by a pool of four threads, and results are
+        collected in completion order.
+
+        Args:
+            url: The URL of the webpage.
+            sections: List of sections (strings) to process.
+
+        Returns:
+            A list of extracted blocks or chunks. A chunk whose worker raised
+            contributes one error block carrying that chunk's index.
+        """
+
+        merged_sections = self._merge(
+            sections,
+            self.chunk_token_threshold,
+            overlap=int(self.chunk_token_threshold * self.overlap_rate),
+        )
+        extracted_content = []
+        extract_func = partial(self.extract, url)
+
+        if self.llm_config.provider.startswith("groq/"):
+            # Sequential processing with a delay
+            for ix, section in enumerate(merged_sections):
+                extracted_content.extend(
+                    extract_func(ix, sanitize_input_encode(section))
+                )
+                time.sleep(0.5)  # 500 ms delay between each processing
+        else:
+            # Parallel processing using ThreadPoolExecutor
+            with ThreadPoolExecutor(max_workers=4) as executor:
+                # Remember which chunk each future belongs to, so a failure
+                # is reported against the right index instead of always 0.
+                future_to_ix = {
+                    executor.submit(
+                        extract_func, ix, sanitize_input_encode(section)
+                    ): ix
+                    for ix, section in enumerate(merged_sections)
+                }
+
+                for future in as_completed(future_to_ix):
+                    try:
+                        extracted_content.extend(future.result())
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error in thread execution: {e}")
+                        # Add error information to extracted_content
+                        extracted_content.append(
+                            {
+                                "index": future_to_ix[future],
+                                "error": True,
+                                "tags": ["error"],
+                                "content": str(e),
+                            }
+                        )
+
+        return extracted_content
+
+    def show_usage(self) -> None:
+        """
+        Print the token usage report to stdout.
+
+        The first table shows the accumulated completion, prompt and total
+        tokens; the second lists the same three counts for every recorded
+        request, numbered from 1.
+        """
+        totals = self.total_usage
+        print("\n=== Token Usage Summary ===")
+        print(f"{'Type':<15} {'Count':>12}")
+        print("-" * 30)
+        summary_rows = (
+            ("Completion", totals.completion_tokens),
+            ("Prompt", totals.prompt_tokens),
+            ("Total", totals.total_tokens),
+        )
+        for label, count in summary_rows:
+            print(f"{label:<15} {count:>12,}")
+
+        print("\n=== Usage History ===")
+        print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
+        print("-" * 48)
+        for request_no, usage in enumerate(self.usages, 1):
+            row = (
+                f"{request_no:<10} {usage.completion_tokens:>12,} "
+                f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
+            )
+            print(row)
+
+
+#######################################################
+# New extraction strategies for JSON-based extraction #
+#######################################################
+class JsonElementExtractionStrategy(ExtractionStrategy):
+    """
+    Abstract base class for extracting structured JSON from HTML content.
+
+    How it works:
+    1. Parses HTML content using the `_parse_html` method.
+    2. Uses a schema to define base selectors, fields, and transformations.
+    3. Extracts data hierarchically, supporting nested fields and lists.
+    4. Handles computed fields with expressions or functions.
+
+    Attributes:
+        DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
+        _extract_item(element, fields): Extracts fields from a single element.
+        _extract_single_field(element, field): Extracts a single field based on its type.
+        _apply_transform(value, transform): Applies a transformation to a value.
+        _compute_field(item, field): Computes a field value using an expression or function.
+        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
+
+    Abstract Methods:
+        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
+        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
+        _get_elements(element, selector): Retrieves child elements using a selector.
+        _get_element_text(element): Extracts text content from an element.
+        _get_element_html(element): Extracts raw HTML from an element.
+        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
+    """
+
+    DEL = "\n"
+
+    def __init__(self, schema: Dict[str, Any], **kwargs):
+        """
+        Set up the strategy with the schema that drives extraction.
+
+        Args:
+            schema (Dict[str, Any]): The schema defining the extraction rules.
+            **kwargs: Forwarded to the base class; the optional "verbose"
+                entry (default False) is also stored on the instance.
+        """
+        super().__init__(**kwargs)
+        self.verbose = kwargs.get("verbose", False)
+        self.schema = schema
+
+    def extract(
+        self, url: str, html_content: str, *q, **kwargs
+    ) -> List[Dict[str, Any]]:
+        """
+        Build one result dict per base element found in the HTML.
+
+        How it works:
+        1. Parses the HTML with `_parse_html`.
+        2. Finds the base elements with the schema's "baseSelector".
+        3. For each base element, reads the optional "baseFields" from the
+           element itself, then merges in the "fields" from `_extract_item`.
+        4. Keeps only the items that ended up non-empty.
+
+        Args:
+            url (str): The URL of the page being processed.
+            html_content (str): The raw HTML content to parse and extract.
+            *q: Additional positional arguments.
+            **kwargs: Additional keyword arguments for custom extraction.
+
+        Returns:
+            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
+        """
+        document = self._parse_html(html_content)
+        roots = self._get_base_elements(document, self.schema["baseSelector"])
+
+        results = []
+        for element in roots:
+            item = {}
+
+            # Values read directly from the base element
+            if "baseFields" in self.schema:
+                for spec in self.schema["baseFields"]:
+                    extracted = self._extract_single_field(element, spec)
+                    if extracted is None:
+                        continue
+                    item[spec["name"]] = extracted
+
+            # Values read from the element's descendants
+            item.update(self._extract_item(element, self.schema["fields"]))
+
+            if item:
+                results.append(item)
+
+        return results
+
+    @abstractmethod
+    def _parse_html(self, html_content: str):
+        """
+        Parse HTML content into the format the other hooks operate on.
+
+        Args:
+            html_content (str): The HTML text to parse.
+
+        Returns:
+            The parsed document that is handed to `_get_base_elements`.
+        """
+
+    @abstractmethod
+    def _get_base_elements(self, parsed_html, selector: str):
+        """
+        Get all base elements matching the selector.
+
+        Args:
+            parsed_html: The document returned by `_parse_html`.
+            selector (str): The schema's base selector.
+
+        Returns:
+            An iterable of base elements.
+        """
+
+    @abstractmethod
+    def _get_elements(self, element, selector: str):
+        """
+        Get the child elements of `element` matching the selector.
+
+        Args:
+            element: The element to search within.
+            selector (str): The selector of the field being extracted.
+
+        Returns:
+            A sequence of matching elements; empty when nothing matches.
+        """
+
+    def _extract_field(self, element, field):
+        """
+        Extract one field, dispatching on the field's type.
+
+        How it works:
+        1. "nested": extracts the sub-fields from the first matching element
+           and returns a dict ({} when nothing matches).
+        2. "list": returns one `_extract_list_item` result per matching element.
+        3. "nested_list": returns one `_extract_item` result per matching element.
+        4. Any other type is handled by `_extract_single_field`.
+
+        Args:
+            element: The element to extract the field from.
+            field (Dict[str, Any]): The field definition in the schema.
+
+        Returns:
+            Any: The extracted value, or the field's "default" if extraction
+            raised an exception.
+        """
+        try:
+            field_type = field["type"]
+
+            if field_type == "nested":
+                nested_elements = self._get_elements(element, field["selector"])
+                if not nested_elements:
+                    return {}
+                nested_element = nested_elements[0]
+                # Compare against None rather than relying on truthiness:
+                # element objects may evaluate false (an lxml element without
+                # children does) and would wrongly yield {}.
+                if nested_element is None:
+                    return {}
+                return self._extract_item(nested_element, field["fields"])
+
+            if field_type == "list":
+                elements = self._get_elements(element, field["selector"])
+                return [self._extract_list_item(el, field["fields"]) for el in elements]
+
+            if field_type == "nested_list":
+                elements = self._get_elements(element, field["selector"])
+                return [self._extract_item(el, field["fields"]) for el in elements]
+
+            return self._extract_single_field(element, field)
+        except Exception as e:
+            if self.verbose:
+                # .get() so a field without "name" cannot raise from the handler
+                print(f"Error extracting field {field.get('name')}: {str(e)}")
+            return field.get("default")
+
+    def _extract_single_field(self, element, field):
+        """
+        Extract a single field based on its type.
+
+        How it works:
+        1. Selects the target element using the field's selector (the first
+           match), or uses `element` itself when the field has no selector.
+        2. Extracts the field value based on its type (text, attribute, html,
+           regex).
+        3. Applies the transformation defined in the schema, if a value was
+           found.
+
+        Args:
+            element: The base element to extract the field from.
+            field (Dict[str, Any]): The field definition in the schema.
+
+        Returns:
+            Any: The extracted field value, or the field's "default" when no
+            element matched or no value could be extracted.
+        """
+
+        if "selector" in field:
+            selected = self._get_elements(element, field["selector"])
+            if not selected:
+                return field.get("default")
+            selected = selected[0]
+        else:
+            selected = element
+
+        value = None
+        field_type = field["type"]
+        if field_type == "text":
+            value = self._get_element_text(selected)
+        elif field_type == "attribute":
+            value = self._get_element_attribute(selected, field["attribute"])
+        elif field_type == "html":
+            value = self._get_element_html(selected)
+        elif field_type == "regex":
+            text = self._get_element_text(selected)
+            # re.search raises TypeError on None, so treat missing text as
+            # "no match".
+            match = re.search(field["pattern"], text) if text is not None else None
+            value = match.group(1) if match else None
+
+        # Only transform real values: a missing attribute or failed regex
+        # leaves value as None, and e.g. None.lower() would raise here instead
+        # of falling back to the default below.
+        if "transform" in field and value is not None:
+            value = self._apply_transform(value, field["transform"])
+
+        return value if value is not None else field.get("default")
+
+    def _extract_list_item(self, element, fields):
+        """
+        Extract the given simple fields from one element of a list.
+
+        Each field goes through `_extract_single_field`; fields whose value
+        is None are left out of the result.
+
+        Args:
+            element: The list element to extract from.
+            fields (List[Dict[str, Any]]): The field definitions to extract.
+
+        Returns:
+            Dict[str, Any]: Field name to extracted value.
+        """
+        collected = {}
+        for spec in fields:
+            extracted = self._extract_single_field(element, spec)
+            if extracted is None:
+                continue
+            collected[spec["name"]] = extracted
+        return collected
+
+    def _extract_item(self, element, fields):
+        """
+        Build an item dict by extracting every field from an element.
+
+        How it works:
+        1. Walks the fields in schema order.
+        2. "computed" fields are derived from the item built so far via
+           `_compute_field`; all others are read from the element via
+           `_extract_field`.
+        3. Values that are None are not stored.
+
+        Args:
+            element: The base element to extract fields from.
+            fields (List[Dict[str, Any]]): The list of fields to extract.
+
+        Returns:
+            Dict[str, Any]: A dictionary representing the extracted item.
+        """
+        item = {}
+        for spec in fields:
+            value = (
+                self._compute_field(item, spec)
+                if spec["type"] == "computed"
+                else self._extract_field(element, spec)
+            )
+            if value is None:
+                continue
+            item[spec["name"]] = value
+        return item
+
+    def _apply_transform(self, value, transform):
+        """
+        Apply a named transformation to a value.
+
+        Supported names are "lowercase", "uppercase" and "strip", which call
+        the value's `lower()`, `upper()` and `strip()` methods. Any other
+        name leaves the value untouched.
+
+        Args:
+            value (str): The value to transform.
+            transform (str): The type of transformation to apply.
+
+        Returns:
+            str: The transformed value.
+        """
+        known_transforms = (
+            ("lowercase", "lower"),
+            ("uppercase", "upper"),
+            ("strip", "strip"),
+        )
+        for transform_name, method_name in known_transforms:
+            if transform == transform_name:
+                return getattr(value, method_name)()
+        return value
+
+    def _compute_field(self, item, field):
+        """
+        Compute a field value from the item extracted so far.
+
+        An "expression" is evaluated with the item's keys available as
+        variables; otherwise a "function" is called with the item.
+
+        Args:
+            item (Dict[str, Any]): The fields extracted so far.
+            field (Dict[str, Any]): The computed field definition.
+
+        Returns:
+            Any: The computed value, the field's "default" if the computation
+            raised, or None when the field has neither key.
+        """
+        try:
+            if "expression" in field:
+                # SECURITY NOTE(review): eval() runs arbitrary Python taken
+                # from the schema; only use schemas from a trusted source.
+                expression = field["expression"]
+                return eval(expression, {}, item)
+            if "function" in field:
+                compute = field["function"]
+                return compute(item)
+        except Exception as e:
+            if self.verbose:
+                print(f"Error computing field {field['name']}: {str(e)}")
+            return field.get("default")
+
+    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Join the HTML sections and extract from the result in one pass.
+
+        How it works:
+        1. Joins the sections with the `DEL` delimiter.
+        2. Hands the joined HTML to `extract`.
+
+        Args:
+            url (str): The URL of the page being processed.
+            sections (List[str]): A list of HTML sections.
+            *q: Additional positional arguments; not forwarded.
+            **kwargs: Additional keyword arguments forwarded to `extract`.
+
+        Returns:
+            List[Dict[str, Any]]: A list of extracted items.
+        """
+        return self.extract(url, self.DEL.join(sections), **kwargs)
+
+    @abstractmethod
+    def _get_element_text(self, element) -> str:
+        """
+        Get the text content of an element.
+
+        Args:
+            element: The element to read.
+
+        Returns:
+            str: The element's text.
+        """
+
+    @abstractmethod
+    def _get_element_html(self, element) -> str:
+        """
+        Get the HTML content of an element.
+
+        Args:
+            element: The element to serialize.
+
+        Returns:
+            str: The element's HTML.
+        """
+
+    @abstractmethod
+    def _get_element_attribute(self, element, attribute: str):
+        """
+        Get the value of an attribute from an element.
+
+        Args:
+            element: The element to read.
+            attribute (str): The name of the attribute.
+
+        Returns:
+            The attribute's value.
+        """
+
+    # Legacy generate_schema() arguments, each mapped to the migration hint
+    # shown when a caller still passes one of them.
+    _GENERATE_SCHEMA_UNWANTED_PROPS = {
+        'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
+        'api_token': 'Instead, use llm_config=LLMConfig(api_token="...")',
+    }
+
+    @staticmethod
+    def generate_schema(
+        html: str,
+        schema_type: str = "CSS", # or XPATH
+        query: str = None,
+        target_json_example: str = None,
+        llm_config: 'LLMConfig' = create_llm_config(),
+        provider: str = None,
+        api_token: str = None,
+        **kwargs
+    ) -> dict:
+        """
+        Generate an extraction schema from HTML content using an LLM.
+
+        Args:
+            html (str): The HTML content to analyze.
+            schema_type (str): "CSS" selects the CSS prompt template; any other
+                value selects the XPath template.
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of the JSON object the
+                schema should produce; appended to the user message.
+            llm_config (LLMConfig): LLM configuration object supplying the
+                provider, API token and base URL.
+            provider (str): Legacy parameter. Must be None; use llm_config instead.
+            api_token (str): Legacy parameter. Must be None; use llm_config instead.
+            **kwargs: Passed to the completion call as `extra_args`.
+
+        Returns:
+            dict: The schema parsed from the LLM's JSON response.
+
+        Raises:
+            AttributeError: If a legacy `provider` or `api_token` is supplied.
+            Exception: If the LLM call or JSON parsing fails; the original
+                error is chained as `__cause__`.
+        """
+        # NOTE(review): the `llm_config` default above is built once, when this
+        # function is defined, and is then shared by every call that omits it.
+        from .prompts import JSON_SCHEMA_BUILDER
+        from .utils import perform_completion_with_backoff
+        # locals() is read here to look up the legacy parameters by name.
+        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
+            if locals()[name] is not None:
+                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
+
+        # Pick the prompt template that matches the requested selector flavour
+        prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
+
+        # Build the prompt
+        system_message = {
+            "role": "system", 
+            "content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
+
+Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
+
+# Schema main keys:
+- name: This is the name of the schema.
+- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
+- baseFields: This is a list of fields that you extract from the base element itself.
+- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute".
+
+# Extra Context:
+In this context, the following items may or may not be present:
+- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
+- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
+- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.
+
+# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
+In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.
+
+# What are the instructions and details for this schema generation?
+{prompt_template}"""
+        }
+
+        user_message = {
+            "role": "user",
+            "content": f"""
+                HTML to analyze:
+                ```html
+                {html}
+                ```
+                """
+        }
+
+        if query:
+            user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
+        if target_json_example:
+            user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
+
+        # Exactly one reminder is added unless both a query and an example were given
+        if query and not target_json_example:
+            user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
+        elif not query and target_json_example:
+            user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
+        elif not query and not target_json_example:
+            user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
+
+        user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
+
+        Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
+        """
+
+        try:
+            # Call LLM with backoff handling; system and user text are sent as one prompt
+            response = perform_completion_with_backoff(
+                provider=llm_config.provider,
+                prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
+                json_response=True,
+                api_token=llm_config.api_token,
+                base_url=llm_config.base_url,
+                extra_args=kwargs
+            )
+
+            # Extract and return schema
+            return json.loads(response.choices[0].message.content)
+
+        except Exception as e:
+            # Chain the cause so the original traceback is not lost
+            raise Exception(f"Failed to generate schema: {str(e)}") from e
+
+class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
+    """
+    CSS-selector flavour of `JsonElementExtractionStrategy`, built on BeautifulSoup.
+
+    How it works:
+    1. The HTML is parsed into a BeautifulSoup tree (lxml parser).
+    2. Base and child elements are located with the CSS selectors from the schema.
+    3. Text, markup and attributes are read from the matched elements.
+
+    Attributes:
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        _parse_html(html_content): Build a BeautifulSoup tree from HTML.
+        _get_base_elements(parsed_html, selector): CSS-select the base elements.
+        _get_elements(element, selector): CSS-select every matching descendant.
+        _get_element_text(element): Stripped text of an element.
+        _get_element_html(element): Markup of an element as a string.
+        _get_element_attribute(element, attribute): Attribute value of an element.
+    """
+
+    def __init__(self, schema: Dict[str, Any], **kwargs):
+        # HTML is the only input format this strategy works with.
+        kwargs.update(input_format="html")
+        super().__init__(schema, **kwargs)
+
+    def _parse_html(self, html_content: str):
+        """Parse `html_content` with BeautifulSoup using the lxml parser."""
+        soup = BeautifulSoup(html_content, "lxml")
+        return soup
+
+    def _get_base_elements(self, parsed_html, selector: str):
+        """Return all elements of the parsed document matching `selector`."""
+        matches = parsed_html.select(selector)
+        return matches
+
+    def _get_elements(self, element, selector: str):
+        """Return every match under `element`, not only the first one."""
+        matches = element.select(selector)
+        return matches
+
+    def _get_element_text(self, element) -> str:
+        """Return the element's text with surrounding whitespace stripped."""
+        text = element.get_text(strip=True)
+        return text
+
+    def _get_element_html(self, element) -> str:
+        """Return the element serialized back to markup."""
+        markup = str(element)
+        return markup
+
+    def _get_element_attribute(self, element, attribute: str):
+        """Return the attribute value as given by the element's `get`."""
+        value = element.get(attribute)
+        return value
+
+class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
+    """
+    `JsonElementExtractionStrategy` implementation backed by lxml and CSS selectors.
+
+    How it works:
+    1. Parses HTML with lxml's recovering HTML parser.
+    2. Compiles each CSS selector once (`lxml.cssselect.CSSSelector`) and wraps it
+       in a function that falls back to progressively looser searches when the
+       compiled selector matches nothing.
+    3. Optionally caches selector results per (element, selector) for the
+       document currently being processed.
+
+    Attributes:
+        use_caching (bool): Cache selector results (kwarg, default True).
+        optimize_common_patterns (bool): Allow `_optimize_selector` to shorten
+            long selectors (kwarg, default True).
+    """
+
+    def __init__(self, schema: Dict[str, Any], **kwargs):
+        kwargs["input_format"] = "html"
+        super().__init__(schema, **kwargs)
+        self._selector_cache = {}
+        self._xpath_cache = {}
+        self._result_cache = {}
+
+        # Control selector optimization strategy
+        self.use_caching = kwargs.get("use_caching", True)
+        self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True)
+
+        # Load lxml dependencies once
+        from lxml import etree, html
+        from lxml.cssselect import CSSSelector
+        self.etree = etree
+        self.html_parser = html
+        self.CSSSelector = CSSSelector
+
+    def _parse_html(self, html_content: str):
+        """Parse HTML content with error recovery.
+
+        Cached selector results point at elements of the previously parsed
+        tree, so they are dropped before a new document is parsed. This keeps
+        the cache from growing without bound and from serving stale matches.
+        """
+        self._result_cache.clear()
+        try:
+            parser = self.etree.HTMLParser(recover=True, remove_blank_text=True)
+            return self.etree.fromstring(html_content, parser)
+        except Exception as e:
+            if self.verbose:
+                print(f"Error parsing HTML, falling back to alternative method: {e}")
+            try:
+                return self.html_parser.fromstring(html_content)
+            except Exception as e2:
+                if self.verbose:
+                    print(f"Critical error parsing HTML: {e2}")
+                # Create minimal document as fallback
+                return self.etree.Element("html")
+
+    def _optimize_selector(self, selector_str):
+        """Optimize common selector patterns for better performance.
+
+        NOTE(review): for selectors with more than three parts this keeps only
+        the last class/id part, which drops the ancestor constraints and may
+        match more elements than the original selector.
+        """
+        if not self.optimize_common_patterns:
+            return selector_str
+
+        # Handle td:nth-child(N) pattern which is very common in table scraping
+        import re
+        if re.search(r'td:nth-child\(\d+\)', selector_str):
+            return selector_str  # Left untouched; has a dedicated fallback handler
+
+        # Split complex selectors into parts for optimization
+        parts = selector_str.split()
+        if len(parts) <= 1:
+            return selector_str
+
+        # For very long selectors, consider using just the last specific part
+        if len(parts) > 3:
+            specific_parts = [p for p in parts if p.startswith(('.', '#'))]
+            if specific_parts:
+                return specific_parts[-1]  # Use most specific class/id selector
+
+        return selector_str
+
+    def _create_selector_function(self, selector_str):
+        """Build `selector_func(element, context_sensitive=True)` for a CSS selector.
+
+        The returned function first applies the compiled selector. When that
+        matches nothing and `context_sensitive` is true it tries, in order: a
+        context-relative XPath, the td:nth-child handler, a class/id search and
+        finally a descendant search on the tag name of the selector's last part.
+        A selector that fails to compile yields a function that always returns [].
+        """
+        # Imported here because the tag-name fallback inside selector_func needs
+        # it and the other methods of this class only import it locally.
+        import re
+
+        original_selector = selector_str
+
+        # Try to optimize the selector if appropriate
+        if self.optimize_common_patterns:
+            selector_str = self._optimize_selector(selector_str)
+
+        try:
+            # Attempt to compile the CSS selector
+            compiled = self.CSSSelector(selector_str)
+            xpath = compiled.path
+
+            # Store XPath for later use
+            self._xpath_cache[selector_str] = xpath
+
+            # Create the wrapper function that implements the selection strategy
+            def selector_func(element, context_sensitive=True):
+                cache_key = None
+
+                # Use result caching if enabled
+                if self.use_caching:
+                    # Key on the element object itself: the dict then keeps the
+                    # element alive, so two different elements can never share
+                    # an entry (which an "id" attribute or a recycled hash can).
+                    # The flag is part of the key because it changes the result.
+                    cache_key = (element, selector_str, context_sensitive)
+
+                    if cache_key in self._result_cache:
+                        return self._result_cache[cache_key]
+
+                results = []
+                try:
+                    # Strategy 1: Direct CSS selector application (fastest)
+                    results = compiled(element)
+
+                    # If that fails and we need context sensitivity
+                    if not results and context_sensitive:
+                        # Strategy 2: Try XPath with context adjustment
+                        context_xpath = self._make_context_sensitive_xpath(xpath, element)
+                        if context_xpath:
+                            results = element.xpath(context_xpath)
+
+                        # Strategy 3: Handle special case - nth-child
+                        if not results and 'nth-child' in original_selector:
+                            results = self._handle_nth_child_selector(element, original_selector)
+
+                        # Strategy 4: Direct descendant search for class/ID selectors
+                        if not results:
+                            results = self._fallback_class_id_search(element, original_selector)
+
+                        # Strategy 5: Last resort - tag name search for the final part
+                        if not results:
+                            parts = original_selector.split()
+                            if parts:
+                                last_part = parts[-1]
+                                # Extract tag name from the selector
+                                tag_match = re.match(r'^(\w+)', last_part)
+                                if tag_match:
+                                    tag_name = tag_match.group(1)
+                                    results = element.xpath(f".//{tag_name}")
+
+                    # Cache results if caching is enabled
+                    if self.use_caching and cache_key is not None:
+                        self._result_cache[cache_key] = results
+
+                except Exception as e:
+                    if self.verbose:
+                        print(f"Error applying selector '{selector_str}': {e}")
+
+                return results
+
+            return selector_func
+
+        except Exception as e:
+            if self.verbose:
+                print(f"Error compiling selector '{selector_str}': {e}")
+
+            # Fallback function for invalid selectors
+            return lambda element, context_sensitive=True: []
+
+    def _make_context_sensitive_xpath(self, xpath, element):
+        """Convert absolute XPath to context-sensitive XPath.
+
+        Returns the adjusted XPath string, or None if building it failed.
+        """
+        try:
+            # If starts with descendant-or-self, it's already context-sensitive
+            if xpath.startswith('descendant-or-self::'):
+                return xpath
+
+            # Anchor the expression at the current element
+            if xpath.startswith('/'):
+                context_xpath = f".{xpath}"
+            else:
+                context_xpath = f".//{xpath}"
+
+            # Validate the XPath by trying it
+            try:
+                element.xpath(context_xpath)
+                return context_xpath
+            except Exception:
+                # If that fails, try a simpler descendant search
+                return f".//{xpath.split('/')[-1]}"
+        except Exception:
+            return None
+
+    def _handle_nth_child_selector(self, element, selector_str):
+        """Special handling for nth-child selectors in tables.
+
+        Only the `td:nth-child(N)` form is recognised; other selectors yield [].
+        """
+        import re
+        results = []
+
+        try:
+            # Extract the column number from td:nth-child(N)
+            match = re.search(r'td:nth-child\((\d+)\)', selector_str)
+            if match:
+                col_num = match.group(1)
+
+                # Check if there's content after the nth-child part
+                remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip()
+
+                if remaining_selector:
+                    # If there's a specific element we're looking for after the column
+                    # Extract any tag names from the remaining selector
+                    tag_match = re.search(r'(\w+)', remaining_selector)
+                    tag_name = tag_match.group(1) if tag_match else '*'
+                    results = element.xpath(f".//td[{col_num}]//{tag_name}")
+                else:
+                    # Just get the column cell
+                    results = element.xpath(f".//td[{col_num}]")
+        except Exception as e:
+            if self.verbose:
+                print(f"Error handling nth-child selector: {e}")
+
+        return results
+
+    def _fallback_class_id_search(self, element, selector_str):
+        """Fallback to search by class or ID.
+
+        Collects descendants carrying any class or id named in the selector.
+        """
+        results = []
+
+        try:
+            # Extract class selectors (.classname)
+            import re
+            class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str)
+
+            # Extract ID selectors (#idname)
+            id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str)
+
+            # Try each class. Match whole class tokens so that ".btn" does not
+            # also pick up elements whose class is e.g. "btn-primary".
+            for class_name in class_matches:
+                class_results = element.xpath(
+                    f".//*[contains(concat(' ', normalize-space(@class), ' '), ' {class_name} ')]"
+                )
+                results.extend(class_results)
+
+            # Try each ID (usually more specific)
+            for id_name in id_matches:
+                id_results = element.xpath(f".//*[@id='{id_name}']")
+                results.extend(id_results)
+        except Exception as e:
+            if self.verbose:
+                print(f"Error in fallback class/id search: {e}")
+
+        return results
+
+    def _get_selector(self, selector_str):
+        """Get or create a selector function with caching"""
+        if selector_str not in self._selector_cache:
+            self._selector_cache[selector_str] = self._create_selector_function(selector_str)
+        return self._selector_cache[selector_str]
+
+    def _get_base_elements(self, parsed_html, selector: str):
+        """Get all base elements using the selector"""
+        selector_func = self._get_selector(selector)
+        # For base elements, we don't need context sensitivity
+        return selector_func(parsed_html, context_sensitive=False)
+
+    def _get_elements(self, element, selector: str):
+        """Get child elements using the selector with context sensitivity"""
+        selector_func = self._get_selector(selector)
+        return selector_func(element, context_sensitive=True)
+
+    def _get_element_text(self, element) -> str:
+        """Extract normalized text from element"""
+        try:
+            # Get all non-blank text nodes and join them with single spaces
+            text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip())
+            return text
+        except Exception as e:
+            if self.verbose:
+                print(f"Error extracting text: {e}")
+            # Fallback
+            try:
+                return element.text_content().strip()
+            except Exception:
+                return ""
+
+    def _get_element_html(self, element) -> str:
+        """Get HTML string representation of element"""
+        try:
+            return self.etree.tostring(element, encoding='unicode', method='html')
+        except Exception as e:
+            if self.verbose:
+                print(f"Error serializing HTML: {e}")
+            return ""
+
+    def _get_element_attribute(self, element, attribute: str):
+        """Get attribute value safely"""
+        try:
+            return element.get(attribute)
+        except Exception as e:
+            if self.verbose:
+                print(f"Error getting attribute '{attribute}': {e}")
+            return None
+
+    def _clear_caches(self):
+        """Clear caches to free memory"""
+        if self.use_caching:
+            self._result_cache.clear()
+
+class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
+    # Simpler lxml-based strategy: CSS selectors are compiled with
+    # lxml.cssselect and wrapped in a function with a fixed chain of fallbacks.
+    # Compiled selector functions are cached per selector string; results are not.
+    def __init__(self, schema: Dict[str, Any], **kwargs):
+        kwargs["input_format"] = "html"  # Force HTML input
+        super().__init__(schema, **kwargs)
+        # selector string -> callable(element) returning a list of matches
+        self._selector_cache = {}
+    
+    def _parse_html(self, html_content: str):
+        """Parse HTML with lxml's HTML parser in recover mode."""
+        from lxml import etree
+        parser = etree.HTMLParser(recover=True)
+        return etree.fromstring(html_content, parser)
+    
+    def _get_selector(self, selector_str):
+        """Get a selector function that works within the context of an element"""
+        if selector_str not in self._selector_cache:
+            from lxml.cssselect import CSSSelector
+            try:
+                # Compile once; the closure below reuses it and its XPath translation
+                compiled = CSSSelector(selector_str)
+                
+                # Create a function that will apply this selector appropriately
+                def select_func(element):
+                    # Each attempt returns as soon as it finds something; any
+                    # exception along the way is swallowed and yields [].
+                    try:
+                        # First attempt: direct CSS selector application
+                        results = compiled(element)
+                        if results:
+                            return results
+                        
+                        # Second attempt: contextual XPath selection
+                        # Convert the root-based XPath to a context-based XPath
+                        xpath = compiled.path
+                        
+                        # If the XPath already starts with descendant-or-self, handle it specially
+                        if xpath.startswith('descendant-or-self::'):
+                            context_xpath = xpath
+                        else:
+                            # For normal XPath expressions, make them relative to current context
+                            context_xpath = f"./{xpath.lstrip('/')}"
+                        
+                        results = element.xpath(context_xpath)
+                        if results:
+                            return results
+                        
+                        # Final fallback: simple descendant search for common patterns
+                        if 'nth-child' in selector_str:
+                            # Handle td:nth-child(N) pattern
+                            import re
+                            match = re.search(r'td:nth-child\((\d+)\)', selector_str)
+                            if match:
+                                col_num = match.group(1)
+                                # Text after the first ')' of the selector.
+                                # NOTE(review): this CSS text is spliced into an
+                                # XPath as-is; anything that is not a plain tag
+                                # name presumably fails and ends in the except
+                                # branch below — confirm.
+                                sub_selector = selector_str.split(')', 1)[-1].strip()
+                                if sub_selector:
+                                    return element.xpath(f".//td[{col_num}]//{sub_selector}")
+                                else:
+                                    return element.xpath(f".//td[{col_num}]")
+                        
+                        # Last resort: use only the final whitespace-separated part
+                        # of the selector as a descendant step
+                        parts = selector_str.split()
+                        if len(parts) > 1 and parts[-1]:
+                            return element.xpath(f".//{parts[-1]}")
+                            
+                        return []
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error applying selector '{selector_str}': {e}")
+                        return []
+                
+                self._selector_cache[selector_str] = select_func
+            except Exception as e:
+                if self.verbose:
+                    print(f"Error compiling selector '{selector_str}': {e}")
+                
+                # Fallback function for invalid selectors
+                def fallback_func(element):
+                    return []
+                
+                # Cache the no-op too, so a bad selector is only compiled once
+                self._selector_cache[selector_str] = fallback_func
+                
+        return self._selector_cache[selector_str]
+    
+    def _get_base_elements(self, parsed_html, selector: str):
+        """Apply the cached selector function to the parsed document."""
+        selector_func = self._get_selector(selector)
+        return selector_func(parsed_html)
+    
+    def _get_elements(self, element, selector: str):
+        """Apply the cached selector function to a single element."""
+        selector_func = self._get_selector(selector)
+        return selector_func(element)
+    
+    def _get_element_text(self, element) -> str:
+        """Concatenate all descendant text nodes and strip the ends."""
+        return "".join(element.xpath(".//text()")).strip()
+    
+    def _get_element_html(self, element) -> str:
+        """Serialize the element to a unicode string with lxml."""
+        from lxml import etree
+        return etree.tostring(element, encoding='unicode')
+    
+    def _get_element_attribute(self, element, attribute: str):
+        """Return the attribute value as given by the element's `get`."""
+        return element.get(attribute)    
+
+class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
+    """
+    XPath flavour of `JsonElementExtractionStrategy`, built on lxml.
+
+    How it works:
+    1. The HTML is parsed into an lxml tree.
+    2. Base elements are located with the schema's XPath expressions.
+    3. Child selectors that contain no "/" are treated as simple CSS and
+       rewritten to XPath before being applied.
+
+    Attributes:
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        _parse_html(html_content): Build an lxml tree from HTML.
+        _get_base_elements(parsed_html, selector): XPath-select the base elements.
+        _css_to_xpath(css_selector): Pass XPath through, rewrite simple CSS.
+        _get_elements(element, selector): Select children relative to an element.
+        _get_element_text(element): Concatenated text of an element.
+        _get_element_html(element): Markup of an element as a string.
+        _get_element_attribute(element, attribute): Attribute value of an element.
+    """
+
+    def __init__(self, schema: Dict[str, Any], **kwargs):
+        # HTML is the only input format this strategy works with.
+        kwargs.update(input_format="html")
+        super().__init__(schema, **kwargs)
+
+    def _parse_html(self, html_content: str):
+        """Parse `html_content` into an lxml HTML tree."""
+        tree = html.fromstring(html_content)
+        return tree
+
+    def _get_base_elements(self, parsed_html, selector: str):
+        """Evaluate `selector` as XPath against the parsed document."""
+        matches = parsed_html.xpath(selector)
+        return matches
+
+    def _css_to_xpath(self, css_selector: str) -> str:
+        """Return the selector unchanged if it contains "/", else rewrite it."""
+        looks_like_xpath = "/" in css_selector
+        return css_selector if looks_like_xpath else self._basic_css_to_xpath(css_selector)
+
+    def _basic_css_to_xpath(self, css_selector: str) -> str:
+        """Rewrite child (" > ") or descendant (" ") combinators as XPath steps."""
+        # The child combinator is checked first, exactly as a plain space would
+        # otherwise also match inside " > ".
+        for css_sep, xpath_sep in ((" > ", "/"), (" ", "//")):
+            if css_sep in css_selector:
+                return "//" + xpath_sep.join(css_selector.split(css_sep))
+        return "//" + css_selector
+
+    def _get_elements(self, element, selector: str):
+        """Select matches for `selector` relative to `element`."""
+        xpath = self._css_to_xpath(selector)
+        # A leading "." anchors the expression at the given element.
+        relative = xpath if xpath.startswith(".") else "." + xpath
+        return element.xpath(relative)
+
+    def _get_element_text(self, element) -> str:
+        """Join all descendant text nodes and strip the ends."""
+        pieces = element.xpath(".//text()")
+        return "".join(pieces).strip()
+
+    def _get_element_html(self, element) -> str:
+        """Serialize the element to a unicode string."""
+        markup = etree.tostring(element, encoding="unicode")
+        return markup
+
+    def _get_element_attribute(self, element, attribute: str):
+        """Return the attribute value as given by the element's `get`."""
+        value = element.get(attribute)
+        return value
+
+
+```
+
+
+## File: crawl4ai/models.py
+
+```py
+from dataclasses import dataclass, field
+from datetime import datetime
+from datetime import timedelta
+from enum import Enum
+from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
+from typing import AsyncGenerator
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel, HttpUrl, PrivateAttr
+
+from .ssl_certificate import SSLCertificate
+
+
+###############################
+# Dispatcher Models
+###############################
+@dataclass
+class DomainState:
+    """Mutable per-domain record of three numeric values, all starting at zero.
+
+    NOTE(review): presumably used by the dispatcher's rate limiting to track
+    request timing, back-off delay and failures per domain — confirm at the
+    call sites.
+    """
+    last_request_time: float = 0
+    current_delay: float = 0
+    fail_count: int = 0
+
+
+@dataclass
+class CrawlerTaskResult:
+    """Outcome of one crawl task: the crawl result plus memory, timing and retry data."""
+    task_id: str
+    url: str
+    result: "CrawlResult"  # forward reference; CrawlResult is defined later in this module
+    memory_usage: float
+    peak_memory: float
+    start_time: Union[datetime, float]  # accepts either a datetime or a float
+    end_time: Union[datetime, float]  # accepts either a datetime or a float
+    error_message: str = ""
+    retry_count: int = 0
+    wait_time: float = 0.0
+    
+    @property
+    def success(self) -> bool:
+        """Mirror the `success` flag of the wrapped result."""
+        return self.result.success
+
+class CrawlStatus(Enum):
+    """States a crawl task can be in; each member's value equals its name."""
+    QUEUED = "QUEUED"
+    IN_PROGRESS = "IN_PROGRESS"
+    COMPLETED = "COMPLETED"
+    FAILED = "FAILED"
+
+@dataclass
+class CrawlStats:
+    """Progress record for one crawl task: status, timing, memory and retry data."""
+    task_id: str
+    url: str
+    status: CrawlStatus
+    start_time: Optional[Union[datetime, float]] = None
+    end_time: Optional[Union[datetime, float]] = None
+    memory_usage: float = 0.0
+    peak_memory: float = 0.0
+    error_message: str = ""
+    wait_time: float = 0.0
+    retry_count: int = 0
+    counted_requeue: bool = False
+
+    @staticmethod
+    def _as_datetime(value):
+        """Turn a numeric epoch timestamp into a datetime; pass datetimes through."""
+        # int is accepted as well as float, so whole-second timestamps work too
+        if isinstance(value, (int, float)):
+            return datetime.fromtimestamp(value)
+        return value
+
+    @property
+    def duration(self) -> str:
+        """
+        Elapsed time of the task as an "H:MM:SS" string, truncated to seconds.
+
+        Returns "0:00" when no start time is set. While `end_time` is unset the
+        current time is used as the end.
+        """
+        if not self.start_time:
+            return "0:00"
+
+        start = self._as_datetime(self.start_time)
+        # Get end time or use current time
+        end = self._as_datetime(self.end_time or datetime.now())
+
+        duration = end - start
+        return str(timedelta(seconds=int(duration.total_seconds())))
+
+class DisplayMode(Enum):
+    """Two display modes; each member's value equals its name."""
+    DETAILED = "DETAILED"
+    AGGREGATED = "AGGREGATED"
+
+
+###############################
+# Crawler Models
+###############################
+@dataclass
+class TokenUsage:
+    """Token counters, all defaulting to zero, with two optional detail dicts.
+
+    NOTE(review): presumably filled from an LLM completion's usage data —
+    confirm where instances are created.
+    """
+    completion_tokens: int = 0
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens_details: Optional[dict] = None
+    prompt_tokens_details: Optional[dict] = None
+
+class UrlModel(BaseModel):
+    """Pydantic model pairing an `HttpUrl`-typed URL with a `forced` flag (default False)."""
+    url: HttpUrl
+    forced: bool = False
+
+
+
+@dataclass
+class TraversalStats:
+    """Statistics for the traversal process"""
+
+    # default_factory so each instance records its own creation time; a plain
+    # `datetime.now()` default would be evaluated once, at class definition.
+    start_time: datetime = field(default_factory=datetime.now)
+    urls_processed: int = 0
+    urls_failed: int = 0
+    urls_skipped: int = 0
+    total_depth_reached: int = 0
+    current_depth: int = 0
+
+class DispatchResult(BaseModel):
+    """Pydantic model with a task's id, memory figures, start/end times and error message."""
+    task_id: str
+    memory_usage: float
+    peak_memory: float
+    start_time: Union[datetime, float]  # accepts either a datetime or a float
+    end_time: Union[datetime, float]  # accepts either a datetime or a float
+    error_message: str = ""
+
+class MarkdownGenerationResult(BaseModel):
+    """Pydantic model bundling the markdown variants produced for one page.
+
+    The three plain string fields are required; the two `fit_*` fields are
+    optional and default to None.
+    """
+    raw_markdown: str
+    markdown_with_citations: str
+    references_markdown: str
+    fit_markdown: Optional[str] = None
+    fit_html: Optional[str] = None
+
+    def __str__(self):
+        # str(result) yields the raw markdown
+        return self.raw_markdown
+    
+class CrawlResult(BaseModel):
+    url: str
+    html: str
+    success: bool
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {}
+    links: Dict[str, List[Dict]] = {}
+    downloaded_files: Optional[List[str]] = None
+    js_execution_result: Optional[Dict[str, Any]] = None
+    screenshot: Optional[str] = None
+    pdf: Optional[bytes] = None
+    mhtml: Optional[str] = None
+    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
+    session_id: Optional[str] = None
+    response_headers: Optional[dict] = None
+    status_code: Optional[int] = None
+    ssl_certificate: Optional[SSLCertificate] = None
+    dispatch_result: Optional[DispatchResult] = None
+    redirected_url: Optional[str] = None
+    network_requests: Optional[List[Dict[str, Any]]] = None
+    console_messages: Optional[List[Dict[str, Any]]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+
+# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
+# and model_dump override all exist to support a smooth transition from markdown as a string
+# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
+# 
+# This allows code that expects markdown to be a string to continue working, while also
+# providing access to the full MarkdownGenerationResult object's properties.
+# 
+# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
+# 
+# When backward compatibility is no longer needed in future versions, this entire mechanism
+# can be simplified to a standard field with no custom accessors or serialization logic.
+    
+    def __init__(self, **data):
+        """Build the model, routing ``markdown`` into the private ``_markdown`` slot.
+
+        ``markdown`` is removed from ``data`` before the model is validated.
+        A dict is converted to ``MarkdownGenerationResult``; any other
+        non-None value is stored as given.
+        """
+        markdown_result = data.pop('markdown', None)
+        super().__init__(**data)
+        if markdown_result is not None:
+            self._markdown = (
+                MarkdownGenerationResult(**markdown_result)
+                if isinstance(markdown_result, dict)
+                else markdown_result
+            )
+    
+    @property
+    def markdown(self):
+        """
+        Property that returns a StringCompatibleMarkdown object that behaves like
+        a string but also provides access to MarkdownGenerationResult attributes.
+
+        This approach allows backward compatibility with code that expects 'markdown'
+        to be a string, while providing access to the full MarkdownGenerationResult.
+
+        Returns None when no markdown has been stored. A new wrapper object is
+        created on every access.
+        """
+        if self._markdown is None:
+            return None
+        return StringCompatibleMarkdown(self._markdown)
+
+    @markdown.setter
+    def markdown(self, value):
+        """
+        Setter for the markdown property.
+
+        The value is stored as given; unlike ``__init__``, a dict is not
+        converted to MarkdownGenerationResult here.
+        """
+        self._markdown = value
+    
+    @property
+    def markdown_v2(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        This property exists to inform users that 'markdown_v2' has been
+        deprecated and they should use 'markdown' instead.
+
+        Raises:
+            AttributeError: Always.
+        """
+        raise AttributeError(
+            "The 'markdown_v2' attribute is deprecated and has been removed. "
+            """Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with
+            following properties:
+            - raw_markdown: The raw markdown string
+            - markdown_with_citations: The markdown string with citations
+            - references_markdown: The markdown string with references
+            - fit_markdown: The markdown string with fit text
+            """
+        )
+    
+    @property
+    def fit_markdown(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        Raises:
+            AttributeError: Always; the message points to 'markdown.fit_markdown'.
+        """
+        raise AttributeError(
+            "The 'fit_markdown' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_markdown' instead."
+        )
+    
+    @property
+    def fit_html(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        Raises:
+            AttributeError: Always; the message points to 'markdown.fit_html'.
+        """
+        raise AttributeError(
+            "The 'fit_html' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_html' instead."
+        )
+
+    def model_dump(self, *args, **kwargs):
+        """
+        Override model_dump to include the _markdown private attribute in serialization.
+
+        This override is necessary because:
+        1. PrivateAttr fields are excluded from serialization by default
+        2. We need to maintain backward compatibility by including the 'markdown' field
+           in the serialized output
+        3. We're transitioning from 'markdown_v2' to enhancing 'markdown' to hold
+           the same type of data
+
+        Future developers: This method ensures that the markdown content is properly
+        serialized despite being stored in a private attribute. If the serialization
+        requirements change, this is where you would update the logic.
+
+        Returns:
+            The dict produced by the parent ``model_dump`` with a ``"markdown"``
+            key added whenever markdown has been stored.
+        """
+        result = super().model_dump(*args, **kwargs)
+        # NOTE(review): the "markdown" key is added regardless of any
+        # include/exclude arguments, and the nested dump is called without
+        # args/kwargs - confirm that is intended.
+        if self._markdown is not None:
+            result["markdown"] = self._markdown.model_dump() 
+        return result
+
+class StringCompatibleMarkdown(str):
+    """A string subclass that also provides access to MarkdownGenerationResult attributes"""
+    def __new__(cls, markdown_result):
+        return super().__new__(cls, markdown_result.raw_markdown)
+    
+    def __init__(self, markdown_result):
+        self._markdown_result = markdown_result
+    
+    def __getattr__(self, name):
+        return getattr(self._markdown_result, name)
+
+CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+
+class CrawlResultContainer(Generic[CrawlResultT]):
+    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
+        # Normalize to a list
+        if isinstance(results, list):
+            self._results = results
+        else:
+            self._results = [results]
+
+    def __iter__(self):
+        return iter(self._results)
+
+    def __getitem__(self, index):
+        return self._results[index]
+
+    def __len__(self):
+        return len(self._results)
+
+    def __getattr__(self, attr):
+        # Delegate attribute access to the first element.
+        if self._results:
+            return getattr(self._results[0], attr)
+        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self._results!r})"
+
+RunManyReturn = Union[
+    CrawlResultContainer[CrawlResultT],
+    AsyncGenerator[CrawlResultT, None]
+]
+
+
+# END of backward compatibility code for markdown/markdown_v2.
+# When removing this code in the future, make sure to:
+# 1. Replace the private attribute and property with a standard field
+# 2. Update any serialization logic that might depend on the current behavior
+
+# Raw response data for one fetched page. ``html``, ``response_headers`` and
+# ``status_code`` are required; every other field is optional and defaults to None.
+# NOTE(review): presumably produced by the crawler strategy layer - confirm.
+class AsyncCrawlResponse(BaseModel):
+    html: str
+    response_headers: Dict[str, str]
+    js_execution_result: Optional[Dict[str, Any]] = None
+    status_code: int
+    screenshot: Optional[str] = None
+    pdf_data: Optional[bytes] = None
+    mhtml_data: Optional[str] = None
+    # Optional async callable taking an optional float and resolving to a string.
+    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+    downloaded_files: Optional[List[str]] = None
+    ssl_certificate: Optional[SSLCertificate] = None
+    redirected_url: Optional[str] = None
+    network_requests: Optional[List[Dict[str, Any]]] = None
+    console_messages: Optional[List[Dict[str, Any]]] = None
+
+    # Needed for field types that pydantic cannot validate natively.
+    class Config:
+        arbitrary_types_allowed = True
+
+###############################
+# Scraping Models
+###############################
+# One media element found on a page. ``type`` defaults to "image"; string
+# fields default to "" and numeric fields to 0 unless noted otherwise.
+class MediaItem(BaseModel):
+    src: Optional[str] = ""
+    data: Optional[str] = ""
+    alt: Optional[str] = ""
+    desc: Optional[str] = ""
+    score: Optional[int] = 0
+    type: str = "image"
+    group_id: Optional[int] = 0
+    format: Optional[str] = None
+    width: Optional[int] = None
+
+
+# One hyperlink: target, visible text, title and base domain (all default to "").
+class Link(BaseModel):
+    href: Optional[str] = ""
+    text: Optional[str] = ""
+    title: Optional[str] = ""
+    base_domain: Optional[str] = ""
+
+
+# Media collected from a page, grouped by kind. Every list defaults to empty.
+class Media(BaseModel):
+    images: List[MediaItem] = []
+    videos: List[
+        MediaItem
+    ] = []  # Using MediaItem model for now, can be extended with Video model if needed
+    audios: List[
+        MediaItem
+    ] = []  # Using MediaItem model for now, can be extended with Audio model if needed
+    tables: List[Dict] = []  # Table data extracted from HTML tables
+
+
+# Links collected from a page, split into internal and external lists (both default to empty).
+class Links(BaseModel):
+    internal: List[Link] = []
+    external: List[Link] = []
+
+
+# Output of a scraping pass: cleaned HTML and a success flag, plus media,
+# links and metadata that default to empty containers.
+class ScrapingResult(BaseModel):
+    cleaned_html: str
+    success: bool
+    media: Media = Media()
+    links: Links = Links()
+    metadata: Dict[str, Any] = {}
+
+```
+
+
+## File: crawl4ai/content_filter_strategy.py
+
+```py
+import inspect
+import re
+import time
+from bs4 import BeautifulSoup, Tag
+from typing import List, Tuple, Dict, Optional
+from rank_bm25 import BM25Okapi
+from collections import deque
+from bs4 import NavigableString, Comment
+
+from .utils import (
+    clean_tokens,
+    perform_completion_with_backoff,
+    escape_json_string,
+    sanitize_html,
+    get_home_folder,
+    extract_xml_data,
+    merge_chunks,
+)
+from .types import LLMConfig
+from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
+from abc import ABC, abstractmethod
+import math
+from snowballstemmer import stemmer
+from .models import TokenUsage
+from .prompts import PROMPT_FILTER_CONTENT
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+from .async_logger import AsyncLogger, LogLevel
+from colorama import Fore, Style
+
+
+class RelevantContentFilter(ABC):
+    """Abstract base class for content filtering strategies"""
+
+    def __init__(
+        self,
+        user_query: str = None,
+        verbose: bool = False,
+        logger: Optional[AsyncLogger] = None,
+    ):
+        """
+        Initializes the RelevantContentFilter class with optional user query.
+
+        Args:
+            user_query (str): User query for filtering (optional).
+            verbose (bool): Enable verbose logging (default: False).
+        """
+        self.user_query = user_query
+        self.included_tags = {
+            # Primary structure
+            "article",
+            "main",
+            "section",
+            "div",
+            # List structures
+            "ul",
+            "ol",
+            "li",
+            "dl",
+            "dt",
+            "dd",
+            # Text content
+            "p",
+            "span",
+            "blockquote",
+            "pre",
+            "code",
+            # Headers
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            # Tables
+            "table",
+            "thead",
+            "tbody",
+            "tr",
+            "td",
+            "th",
+            # Other semantic elements
+            "figure",
+            "figcaption",
+            "details",
+            "summary",
+            # Text formatting
+            "em",
+            "strong",
+            "b",
+            "i",
+            "mark",
+            "small",
+            # Rich content
+            "time",
+            "address",
+            "cite",
+            "q",
+        }
+        self.excluded_tags = {
+            "nav",
+            "footer",
+            "header",
+            "aside",
+            "script",
+            "style",
+            "form",
+            "iframe",
+            "noscript",
+        }
+        self.header_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
+        self.negative_patterns = re.compile(
+            r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
+        )
+        self.min_word_count = 2
+        self.verbose = False
+        self.logger = logger
+
+    @abstractmethod
+    def filter_content(self, html: str) -> List[str]:
+        """Abstract method to be implemented by specific filtering strategies.
+
+        Args:
+            html (str): HTML content to be filtered.
+
+        Returns:
+            List[str]: The retained content pieces.
+        """
+        pass
+
+    def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
+        """Common method to extract page metadata with fallbacks"""
+        if self.user_query:
+            return self.user_query
+
+        query_parts = []
+
+        # Title
+        try:
+            title = soup.title.string
+            if title:
+                query_parts.append(title)
+        except Exception:
+            pass
+
+        if soup.find("h1"):
+            query_parts.append(soup.find("h1").get_text())
+
+        # Meta tags
+        temp = ""
+        for meta_name in ["keywords", "description"]:
+            meta = soup.find("meta", attrs={"name": meta_name})
+            if meta and meta.get("content"):
+                query_parts.append(meta["content"])
+                temp += meta["content"]
+
+        # If still empty, grab first significant paragraph
+        if not temp:
+            # Find the first tag P thatits text contains more than 50 characters
+            for p in body.find_all("p"):
+                if len(p.get_text()) > 150:
+                    query_parts.append(p.get_text()[:150])
+                    break
+
+        return " ".join(filter(None, query_parts))
+
+    def extract_text_chunks(
+        self, body: Tag, min_word_threshold: int = None
+    ) -> List[Tuple[str, str]]:
+        """
+        Extracts text chunks from a BeautifulSoup body element while preserving order.
+        Returns list of tuples (text, tag_name) for classification.
+
+        Args:
+            body: BeautifulSoup Tag object representing the body element
+
+        Returns:
+            List of (text, tag_name) tuples
+        """
+        # Tags to ignore - inline elements that shouldn't break text flow
+        INLINE_TAGS = {
+            "a",
+            "abbr",
+            "acronym",
+            "b",
+            "bdo",
+            "big",
+            "br",
+            "button",
+            "cite",
+            "code",
+            "dfn",
+            "em",
+            "i",
+            "img",
+            "input",
+            "kbd",
+            "label",
+            "map",
+            "object",
+            "q",
+            "samp",
+            "script",
+            "select",
+            "small",
+            "span",
+            "strong",
+            "sub",
+            "sup",
+            "textarea",
+            "time",
+            "tt",
+            "var",
+        }
+
+        # Tags that typically contain meaningful headers
+        HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "header"}
+
+        chunks = []
+        current_text = []
+        chunk_index = 0
+
+        def should_break_chunk(tag: Tag) -> bool:
+            """Determine if a tag should cause a break in the current text chunk"""
+            return tag.name not in INLINE_TAGS and not (
+                tag.name == "p" and len(current_text) == 0
+            )
+
+        # Use deque for efficient push/pop operations
+        stack = deque([(body, False)])
+
+        while stack:
+            element, visited = stack.pop()
+
+            if visited:
+                # End of block element - flush accumulated text
+                if current_text and should_break_chunk(element):
+                    text = " ".join("".join(current_text).split())
+                    if text:
+                        tag_type = (
+                            "header" if element.name in HEADER_TAGS else "content"
+                        )
+                        chunks.append((chunk_index, text, tag_type, element))
+                        chunk_index += 1
+                    current_text = []
+                continue
+
+            if isinstance(element, NavigableString):
+                if str(element).strip():
+                    current_text.append(str(element).strip())
+                continue
+
+            # Pre-allocate children to avoid multiple list operations
+            children = list(element.children)
+            if not children:
+                continue
+
+            # Mark block for revisit after processing children
+            stack.append((element, True))
+
+            # Add children in reverse order for correct processing
+            for child in reversed(children):
+                if isinstance(child, (Tag, NavigableString)):
+                    stack.append((child, False))
+
+        # Handle any remaining text
+        if current_text:
+            text = " ".join("".join(current_text).split())
+            if text:
+                chunks.append((chunk_index, text, "content", body))
+
+        if min_word_threshold:
+            chunks = [
+                chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold
+            ]
+
+        return chunks
+
+    def _deprecated_extract_text_chunks(
+        self, soup: BeautifulSoup
+    ) -> List[Tuple[int, str, Tag]]:
+        """Common method for extracting text chunks (older, recursive variant).
+
+        Walks the tree depth-first and records every element in
+        ``self.included_tags`` that is not excluded and has enough words.
+
+        Args:
+            soup: Parsed document; traversal starts at ``soup.body`` when
+                present, otherwise at ``soup`` itself.
+
+        Returns:
+            List of (index, text, element) tuples in document order.
+        """
+        # Per-call cache of element text, keyed by id(element).
+        _text_cache = {}
+
+        def fast_text(element: Tag) -> str:
+            # Only direct string children are collected; text inside nested
+            # tags is not included.
+            elem_id = id(element)
+            if elem_id in _text_cache:
+                return _text_cache[elem_id]
+            texts = []
+            for content in element.contents:
+                if isinstance(content, str):
+                    text = content.strip()
+                    if text:
+                        texts.append(text)
+            result = " ".join(texts)
+            _text_cache[elem_id] = result
+            return result
+
+        candidates = []
+        index = 0
+
+        def dfs(element):
+            nonlocal index
+            if isinstance(element, Tag):
+                if element.name in self.included_tags:
+                    if not self.is_excluded(element):
+                        text = fast_text(element)
+                        word_count = len(text.split())
+
+                        # Headers pass through with adjusted minimum
+                        if element.name in self.header_tags:
+                            if word_count >= 3:  # Minimal sanity check for headers
+                                candidates.append((index, text, element))
+                                index += 1
+                        # Regular content uses standard minimum
+                        elif word_count >= self.min_word_count:
+                            candidates.append((index, text, element))
+                            index += 1
+
+                # Children are visited even when the element itself was skipped.
+                for child in element.children:
+                    dfs(child)
+
+        dfs(soup.body if soup.body else soup)
+        return candidates
+
+    def is_excluded(self, tag: Tag) -> bool:
+        """Common method for exclusion logic.
+
+        Returns:
+            True when the tag name is in ``self.excluded_tags``, or when the
+            tag's class names and id (joined by spaces) match
+            ``self.negative_patterns``.
+        """
+        if tag.name in self.excluded_tags:
+            return True
+        # Build one "class names + id" string; empty parts are dropped.
+        class_id = " ".join(
+            filter(None, [" ".join(tag.get("class", [])), tag.get("id", "")])
+        )
+        return bool(self.negative_patterns.search(class_id))
+
+    def clean_element(self, tag: Tag) -> str:
+        """Common method for cleaning HTML elements with minimal overhead"""
+        if not tag or not isinstance(tag, Tag):
+            return ""
+
+        unwanted_tags = {"script", "style", "aside", "form", "iframe", "noscript"}
+        unwanted_attrs = {
+            "style",
+            "onclick",
+            "onmouseover",
+            "align",
+            "bgcolor",
+            "class",
+            "id",
+        }
+
+        # Use string builder pattern for better performance
+        builder = []
+
+        def render_tag(elem):
+            if not isinstance(elem, Tag):
+                if isinstance(elem, str):
+                    builder.append(elem.strip())
+                return
+
+            if elem.name in unwanted_tags:
+                return
+
+            # Start tag
+            builder.append(f"<{elem.name}")
+
+            # Add cleaned attributes
+            attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs}
+            for key, value in attrs.items():
+                builder.append(f' {key}="{value}"')
+
+            builder.append(">")
+
+            # Process children
+            for child in elem.children:
+                render_tag(child)
+
+            # Close tag
+            builder.append(f"</{elem.name}>")
+
+        try:
+            render_tag(tag)
+            return "".join(builder)
+        except Exception:
+            return str(tag)  # Fallback to original if anything fails
+
+
+class BM25ContentFilter(RelevantContentFilter):
+    """
+    Content filtering using BM25 algorithm with priority tag handling.
+
+    How it works:
+    1. Extracts page metadata with fallbacks.
+    2. Extracts text chunks from the body element.
+    3. Tokenizes the corpus and query.
+    4. Applies BM25 algorithm to calculate scores for each chunk and scales each score by its tag's priority weight.
+    5. Filters out chunks below the threshold.
+    6. Sorts the remaining chunks back into original document order.
+    7. Returns the cleaned HTML of every remaining chunk.
+
+    Attributes:
+        user_query (str): User query for filtering (optional).
+        bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
+        language (str): Language for stemming (default: 'english').
+
+        Methods:
+            filter_content(self, html: str, min_word_threshold: int = None)
+    """
+
+    def __init__(
+        self,
+        user_query: str = None,
+        bm25_threshold: float = 1.0,
+        language: str = "english",
+    ):
+        """
+        Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
+
+        Note:
+        If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
+
+        Args:
+            user_query (str): User query for filtering (optional).
+            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
+            language (str): Language for stemming (default: 'english').
+        """
+        # Only the query is forwarded; verbose and logger keep the base-class defaults.
+        super().__init__(user_query=user_query)
+        self.bm25_threshold = bm25_threshold
+        # Score multipliers by tag name; tags not listed here are weighted 1.0 by filter_content.
+        self.priority_tags = {
+            "h1": 5.0,
+            "h2": 4.0,
+            "h3": 3.0,
+            "title": 4.0,
+            "strong": 2.0,
+            "b": 1.5,
+            "em": 1.5,
+            "blockquote": 2.0,
+            "code": 2.0,
+            "pre": 1.5,
+            "th": 1.5,  # Table headers
+        }
+        self.stemmer = stemmer(language)
+
+    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+        """
+        Implements content filtering using BM25 algorithm with priority tag handling.
+
+        Each text chunk is scored against the page query, the score is
+        multiplied by the chunk element's priority weight, and chunks scoring
+        at least ``bm25_threshold`` are kept.
+
+        Args:
+            html (str): HTML content to be filtered.
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+
+        Returns:
+            List[str]: Cleaned HTML of the selected elements, in original
+            document order. Empty when the input is empty or not a string,
+            when no query can be derived, or when nothing passes the threshold.
+        """
+        if not html or not isinstance(html, str):
+            return []
+
+        soup = BeautifulSoup(html, "lxml")
+
+        # Check if body is present
+        if not soup.body:
+            # Wrap in body tag if missing
+            soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
+        body = soup.find("body")
+
+        query = self.extract_page_query(soup, body)
+
+        if not query:
+            return []
+
+        candidates = self.extract_text_chunks(body, min_word_threshold)
+
+        if not candidates:
+            return []
+
+        # Tokenize corpus and query: lower-case, split on whitespace, stem each word
+        tokenized_corpus = [
+            [self.stemmer.stemWord(word) for word in chunk.lower().split()]
+            for _, chunk, _, _ in candidates
+        ]
+        tokenized_query = [
+            self.stemmer.stemWord(word) for word in query.lower().split()
+        ]
+
+        # Clean from stop words and noise
+        tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
+        tokenized_query = clean_tokens(tokenized_query)
+
+        bm25 = BM25Okapi(tokenized_corpus)
+        scores = bm25.get_scores(tokenized_query)
+
+        # Adjust scores with tag weights
+        adjusted_candidates = []
+        for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
+            tag_weight = self.priority_tags.get(tag.name, 1.0)
+            adjusted_score = score * tag_weight
+            adjusted_candidates.append((adjusted_score, index, chunk, tag))
+
+        # Filter candidates by threshold
+        selected_candidates = [
+            (index, chunk, tag)
+            for adjusted_score, index, chunk, tag in adjusted_candidates
+            if adjusted_score >= self.bm25_threshold
+        ]
+
+        if not selected_candidates:
+            return []
+
+        # Sort selected candidates by original document order
+        selected_candidates.sort(key=lambda x: x[0])
+
+        return [self.clean_element(tag) for _, _, tag in selected_candidates]
+
+
+class PruningContentFilter(RelevantContentFilter):
+    """
+    Content filtering using pruning algorithm with dynamic threshold.
+
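+    How it works:
+    1. Parses the HTML, wrapping it in a body tag when none is present.
+    2. Removes HTML comments and excluded tags.
+    3. Walks the tree from the body and computes a composite score for each node.
+    4. Removes every node whose score is below the (fixed or dynamic) threshold,
+       together with its whole subtree.
+    5. Descends into the children of the nodes that were kept.
+    6. Returns the remaining top-level children of the body that still contain text, as HTML strings.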
+
+    Attributes:
+        user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
+        min_word_threshold (int): Minimum word threshold for filtering (optional).
+        threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
+        threshold (float): Fixed threshold value (default: 0.48).
+
+        Methods:
+            filter_content(self, html: str, min_word_threshold: int = None):
+    """
+
+    def __init__(
+        self,
+        user_query: str = None,
+        min_word_threshold: int = None,
+        threshold_type: str = "fixed",
+        threshold: float = 0.48,
+    ):
+        """
+        Initializes the PruningContentFilter class, if not provided, falls back to page metadata.
+
+        Note:
+        If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
+
+        Args:
+            user_query (str): User query for filtering (optional).
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+            threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
+            threshold (float): Fixed threshold value (default: 0.48).
+        """
+        super().__init__(None)
+        self.min_word_threshold = min_word_threshold
+        self.threshold_type = threshold_type
+        self.threshold = threshold
+
+        # Add tag importance for dynamic threshold
+        self.tag_importance = {
+            "article": 1.5,
+            "main": 1.4,
+            "section": 1.3,
+            "p": 1.2,
+            "h1": 1.4,
+            "h2": 1.3,
+            "h3": 1.2,
+            "div": 0.7,
+            "span": 0.6,
+        }
+
+        # Metric configuration
+        self.metric_config = {
+            "text_density": True,
+            "link_density": True,
+            "tag_weight": True,
+            "class_id_weight": True,
+            "text_length": True,
+        }
+
+        self.metric_weights = {
+            "text_density": 0.4,
+            "link_density": 0.2,
+            "tag_weight": 0.2,
+            "class_id_weight": 0.1,
+            "text_length": 0.1,
+        }
+
+        self.tag_weights = {
+            "div": 0.5,
+            "p": 1.0,
+            "article": 1.5,
+            "section": 1.0,
+            "span": 0.3,
+            "li": 0.5,
+            "ul": 0.5,
+            "ol": 0.5,
+            "h1": 1.2,
+            "h2": 1.1,
+            "h3": 1.0,
+            "h4": 0.9,
+            "h5": 0.8,
+            "h6": 0.7,
+        }
+
+    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+        """
+        Implements content filtering using pruning algorithm with dynamic threshold.
+
+        The document is parsed, comments and excluded tags are removed, and the
+        tree is pruned in place starting from the body.
+
+        Args:
+            html (str): HTML content to be filtered.
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+                NOTE(review): this argument is not read in this method; scoring
+                uses ``self.min_word_threshold`` - confirm which one is intended.
+
+        Returns:
+            List[str]: HTML strings of the body's remaining direct child
+            elements that still contain text. Empty when the input is empty
+            or not a string.
+        """
+        if not html or not isinstance(html, str):
+            return []
+
+        soup = BeautifulSoup(html, "lxml")
+        if not soup.body:
+            soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
+
+        # Remove comments and unwanted tags
+        self._remove_comments(soup)
+        self._remove_unwanted_tags(soup)
+
+        # Prune tree starting from body
+        body = soup.find("body")
+        self._prune_tree(body)
+
+        # Extract remaining content as list of HTML strings
+        content_blocks = []
+        for element in body.children:
+            # Skip bare text nodes; only elements are returned.
+            if isinstance(element, str) or not hasattr(element, "name"):
+                continue
+            if len(element.get_text(strip=True)) > 0:
+                content_blocks.append(str(element))
+
+        return content_blocks
+
+    def _remove_comments(self, soup):
+        """Removes HTML comments"""
+        for element in soup(text=lambda text: isinstance(text, Comment)):
+            element.extract()
+
+    def _remove_unwanted_tags(self, soup):
+        """Removes every element whose tag name is in ``self.excluded_tags``, in place."""
+        for tag in self.excluded_tags:
+            # decompose() removes the element together with its whole subtree.
+            for element in soup.find_all(tag):
+                element.decompose()
+
+    def _prune_tree(self, node):
+        """
+        Prunes the tree starting from the given node.
+
+        The node is scored; when the score is below the threshold the node is
+        removed with its whole subtree, otherwise its children are pruned
+        recursively.
+
+        Args:
+            node (Tag): The node from which the pruning starts.
+        """
+        # Ignore missing nodes and nodes without a tag name.
+        if not node or not hasattr(node, "name") or node.name is None:
+            return
+
+        text_len = len(node.get_text(strip=True))
+        # Length of the node's inner markup, used as the text-density denominator.
+        tag_len = len(node.encode_contents().decode("utf-8"))
+        # Only direct child links are counted, and only those whose ``.string`` is not None.
+        link_text_len = sum(
+            len(s.strip())
+            for s in (a.string for a in node.find_all("a", recursive=False))
+            if s
+        )
+
+        metrics = {
+            "node": node,
+            "tag_name": node.name,
+            "text_len": text_len,
+            "tag_len": tag_len,
+            "link_text_len": link_text_len,
+        }
+
+        score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
+
+        if self.threshold_type == "fixed":
+            should_remove = score < self.threshold
+        else:  # dynamic
+            tag_importance = self.tag_importance.get(node.name, 0.7)
+            text_ratio = text_len / tag_len if tag_len > 0 else 0
+            link_ratio = link_text_len / text_len if text_len > 0 else 1
+
+            # Lower the bar for important or text-dense nodes, raise it for link-heavy ones.
+            threshold = self.threshold  # base threshold
+            if tag_importance > 1:
+                threshold *= 0.8
+            if text_ratio > 0.4:
+                threshold *= 0.9
+            if link_ratio > 0.6:
+                threshold *= 1.2
+
+            should_remove = score < threshold
+
+        if should_remove:
+            node.decompose()
+        else:
+            # Snapshot the children first: pruning a child mutates node.children.
+            children = [child for child in node.children if hasattr(child, "name")]
+            for child in children:
+                self._prune_tree(child)
+
+    def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
+        """Computes the composite score"""
+        if self.min_word_threshold:
+            # Get raw text from metrics node - avoid extra processing
+            text = metrics["node"].get_text(strip=True)
+            word_count = text.count(" ") + 1
+            if word_count < self.min_word_threshold:
+                return -1.0  # Guaranteed removal
+        score = 0.0
+        total_weight = 0.0
+
+        if self.metric_config["text_density"]:
+            density = text_len / tag_len if tag_len > 0 else 0
+            score += self.metric_weights["text_density"] * density
+            total_weight += self.metric_weights["text_density"]
+
+        if self.metric_config["link_density"]:
+            density = 1 - (link_text_len / text_len if text_len > 0 else 0)
+            score += self.metric_weights["link_density"] * density
+            total_weight += self.metric_weights["link_density"]
+
+        if self.metric_config["tag_weight"]:
+            tag_score = self.tag_weights.get(metrics["tag_name"], 0.5)
+            score += self.metric_weights["tag_weight"] * tag_score
+            total_weight += self.metric_weights["tag_weight"]
+
+        if self.metric_config["class_id_weight"]:
+            class_score = self._compute_class_id_weight(metrics["node"])
+            score += self.metric_weights["class_id_weight"] * max(0, class_score)
+            total_weight += self.metric_weights["class_id_weight"]
+
+        if self.metric_config["text_length"]:
+            score += self.metric_weights["text_length"] * math.log(text_len + 1)
+            total_weight += self.metric_weights["text_length"]
+
+        return score / total_weight if total_weight > 0 else 0
+
+    def _compute_class_id_weight(self, node):
+        """Computes the class ID weight"""
+        class_id_score = 0
+        if "class" in node.attrs:
+            classes = " ".join(node["class"])
+            if self.negative_patterns.match(classes):
+                class_id_score -= 0.5
+        if "id" in node.attrs:
+            element_id = node["id"]
+            if self.negative_patterns.match(element_id):
+                class_id_score -= 0.5
+        return class_id_score
+
+
+class LLMContentFilter(RelevantContentFilter):
+    """Content filtering using LLMs to generate relevant markdown.
+
+    How it works:
+    1. Extracts page metadata with fallbacks.
+    2. Extracts text chunks from the body element.
+    3. Applies LLMs to generate markdown for each chunk.
+    4. Filters out chunks below the threshold.
+    5. Sorts chunks by score in descending order.
+    6. Returns the top N chunks.
+
+    Attributes:
+        llm_config (LLMConfig): LLM configuration object.
+        instruction (str): Instruction for LLM markdown generation
+        chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
+        overlap_rate (float): Overlap rate for chunking (default: 0.5).
+        word_token_rate (float): Word token rate for chunking (default: 0.2).
+        verbose (bool): Enable verbose logging (default: False).
+        logger (AsyncLogger): Custom logger for LLM operations (optional).
+    """
+    _UNWANTED_PROPS = {
+        'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
+        'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")',
+        'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
+        'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
+    }
+
+    def __init__(
+        self,
+        llm_config: "LLMConfig" = None,
+        instruction: str = None,
+        chunk_token_threshold: int = int(1e9),
+        overlap_rate: float = OVERLAP_RATE,
+        word_token_rate: float = WORD_TOKEN_RATE,
+        # char_token_rate: float = WORD_TOKEN_RATE * 5,
+        # chunk_mode: str = "char",
+        verbose: bool = False,
+        logger: Optional[AsyncLogger] = None,
+        ignore_cache: bool = True,
+        # Deprecated properties
+        provider: str = DEFAULT_PROVIDER,
+        api_token: Optional[str] = None,
+        base_url: Optional[str] = None,
+        api_base: Optional[str] = None,
+        extra_args: Dict = None,
+    ):
+        super().__init__(None)
+        self.provider = provider
+        self.api_token = api_token
+        self.base_url = base_url or api_base
+        self.llm_config = llm_config
+        self.instruction = instruction
+        self.chunk_token_threshold = chunk_token_threshold
+        self.overlap_rate = overlap_rate
+        self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
+        # self.chunk_mode: str = chunk_mode
+        # self.char_token_rate = char_token_rate or word_token_rate / 5
+        # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
+        self.token_rate = word_token_rate or WORD_TOKEN_RATE
+        self.extra_args = extra_args or {}
+        self.ignore_cache = ignore_cache
+        self.verbose = verbose
+
+        # Setup logger with custom styling for LLM operations
+        if logger:
+            self.logger = logger
+        elif verbose:
+            self.logger = AsyncLogger(
+                verbose=verbose,
+                icons={
+                    **AsyncLogger.DEFAULT_ICONS,
+                    "LLM": "★",  # Star for LLM operations
+                    "CHUNK": "◈",  # Diamond for chunks
+                    "CACHE": "⚡",  # Lightning for cache operations
+                },
+                colors={
+                    **AsyncLogger.DEFAULT_COLORS,
+                    LogLevel.INFO: Fore.MAGENTA
+                    + Style.DIM,  # Dimmed purple for LLM ops
+                },
+            )
+        else:
+            self.logger = None
+
+        self.usages = []
+        self.total_usage = TokenUsage()
+    
+    def __setattr__(self, name, value):
+        """Handle attribute setting."""
+        # TODO: Planning to set properties dynamically based on the __init__ signature
+        sig = inspect.signature(self.__init__)
+        all_params = sig.parameters  # Dictionary of parameter names and their details
+
+        if name in self._UNWANTED_PROPS and value is not all_params[name].default:
+            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")
+        
+        super().__setattr__(name, value)  
+        
+    def _get_cache_key(self, html: str, instruction: str) -> str:
+        """Generate a unique cache key based on HTML and instruction"""
+        content = f"{html}{instruction}"
+        return hashlib.md5(content.encode()).hexdigest()
+
+    def _merge_chunks(self, text: str) -> List[str]:
+        """Split text into chunks with overlap using char or word mode."""
+        ov = int(self.chunk_token_threshold * self.overlap_rate)
+        sections = merge_chunks(
+            docs=[text],
+            target_size=self.chunk_token_threshold,
+            overlap=ov,
+            word_token_ratio=self.word_token_rate,
+        )
+        return sections
+
+    def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
+        if not html or not isinstance(html, str):
+            return []
+
+        if self.logger:
+            self.logger.info(
+                "Starting LLM markdown content filtering process",
+                tag="LLM",
+                params={"provider": self.llm_config.provider},
+                colors={"provider": Fore.CYAN},
+            )
+
+        # Cache handling
+        cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter"
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        cache_key = self._get_cache_key(html, self.instruction or "")
+        cache_file = cache_dir / f"{cache_key}.json"
+
+        # if ignore_cache == None:
+        ignore_cache = self.ignore_cache
+
+        if not ignore_cache and cache_file.exists():
+            if self.logger:
+                self.logger.info("Found  cached markdown result", tag="CACHE")
+            try:
+                with cache_file.open("r") as f:
+                    cached_data = json.load(f)
+                    usage = TokenUsage(**cached_data["usage"])
+                    self.usages.append(usage)
+                    self.total_usage.completion_tokens += usage.completion_tokens
+                    self.total_usage.prompt_tokens += usage.prompt_tokens
+                    self.total_usage.total_tokens += usage.total_tokens
+                    return cached_data["blocks"]
+            except Exception as e:
+                if self.logger:
+                    self.logger.error(
+                        f"LLM markdown: Cache read error: {str(e)}", tag="CACHE"
+                    )
+
+        # Split into chunks
+        html_chunks = self._merge_chunks(html)
+        if self.logger:
+            self.logger.info(
+                "LLM markdown: Split content into {chunk_count} chunks",
+                tag="CHUNK",
+                params={"chunk_count": len(html_chunks)},
+                colors={"chunk_count": Fore.YELLOW},
+            )
+
+        start_time = time.time()
+
+        # Process chunks in parallel
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            futures = []
+            for i, chunk in enumerate(html_chunks):
+                if self.logger:
+                    self.logger.debug(
+                        "LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
+                        tag="CHUNK",
+                        params={"chunk_num": i + 1, "total_chunks": len(html_chunks)},
+                    )
+
+                prompt_variables = {
+                    "HTML": escape_json_string(sanitize_html(chunk)),
+                    "REQUEST": self.instruction
+                    or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content.",
+                }
+
+                prompt = PROMPT_FILTER_CONTENT
+                for var, value in prompt_variables.items():
+                    prompt = prompt.replace("{" + var + "}", value)
+
+                def _proceed_with_chunk(
+                    provider: str,
+                    prompt: str,
+                    api_token: str,
+                    base_url: Optional[str] = None,
+                    extra_args: Dict = {},
+                ) -> List[str]:
+                    if self.logger:
+                        self.logger.info(
+                            "LLM Markdown: Processing chunk {chunk_num}",
+                            tag="CHUNK",
+                            params={"chunk_num": i + 1},
+                        )
+                    return perform_completion_with_backoff(
+                        provider,
+                        prompt,
+                        api_token,
+                        base_url=base_url,
+                        extra_args=extra_args,
+                    )
+
+                future = executor.submit(
+                    _proceed_with_chunk,
+                    self.llm_config.provider,
+                    prompt,
+                    self.llm_config.api_token,
+                    self.llm_config.base_url,
+                    self.extra_args,
+                )
+                futures.append((i, future))
+
+            # Collect results in order
+            ordered_results = []
+            for i, future in sorted(futures):
+                try:
+                    response = future.result()
+
+                    # Track usage
+                    usage = TokenUsage(
+                        completion_tokens=response.usage.completion_tokens,
+                        prompt_tokens=response.usage.prompt_tokens,
+                        total_tokens=response.usage.total_tokens,
+                        completion_tokens_details=(
+                            response.usage.completion_tokens_details.__dict__
+                            if response.usage.completion_tokens_details
+                            else {}
+                        ),
+                        prompt_tokens_details=(
+                            response.usage.prompt_tokens_details.__dict__
+                            if response.usage.prompt_tokens_details
+                            else {}
+                        ),
+                    )
+                    self.usages.append(usage)
+                    self.total_usage.completion_tokens += usage.completion_tokens
+                    self.total_usage.prompt_tokens += usage.prompt_tokens
+                    self.total_usage.total_tokens += usage.total_tokens
+
+                    blocks = extract_xml_data(
+                        ["content"], response.choices[0].message.content
+                    )["content"]
+                    if blocks:
+                        ordered_results.append(blocks)
+                        if self.logger:
+                            self.logger.success(
+                                "LLM markdown: Successfully processed chunk {chunk_num}",
+                                tag="CHUNK",
+                                params={"chunk_num": i + 1},
+                            )
+                except Exception as e:
+                    if self.logger:
+                        self.logger.error(
+                            "LLM markdown: Error processing chunk {chunk_num}: {error}",
+                            tag="CHUNK",
+                            params={"chunk_num": i + 1, "error": str(e)},
+                        )
+
+        end_time = time.time()
+        if self.logger:
+            self.logger.success(
+                "LLM markdown: Completed processing in {time:.2f}s",
+                tag="LLM",
+                params={"time": end_time - start_time},
+                colors={"time": Fore.YELLOW},
+            )
+
+        result = ordered_results if ordered_results else []
+
+        # Cache the final result
+        cache_data = {"blocks": result, "usage": self.total_usage.__dict__}
+        with cache_file.open("w") as f:
+            json.dump(cache_data, f)
+            if self.logger:
+                self.logger.info("Cached results for future use", tag="CACHE")
+
+        return result
+
+    def show_usage(self) -> None:
+        """Print usage statistics"""
+        print("\n=== Token Usage Summary ===")
+        print(f"{'Type':<15} {'Count':>12}")
+        print("-" * 30)
+        print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
+        print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
+        print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
+
+        if self.usages:
+            print("\n=== Usage History ===")
+            print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
+            print("-" * 48)
+            for i, usage in enumerate(self.usages, 1):
+                print(
+                    f"{i:<10} {usage.completion_tokens:>12,} "
+                    f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
+                )
+
+```
+
+
+## File: crawl4ai/markdown_generation_strategy.py
+
+```py
+from abc import ABC, abstractmethod
+from typing import Optional, Dict, Any, Tuple
+from .models import MarkdownGenerationResult
+from .html2text import CustomHTML2Text
+# from .types import RelevantContentFilter
+from .content_filter_strategy import RelevantContentFilter
+import re
+from urllib.parse import urljoin
+
+# Pre-compiled pattern for inline markdown links and images.
+# It matches an optional leading "!" (image), then "[text](url)" with an
+# optional double-quoted title after the URL.
+# Capture groups: 1 = link text, 2 = URL, 3 = title (None when absent).
+LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
+
+
+def fast_urljoin(base: str, url: str) -> str:
+    """Fast URL joining for common cases."""
+    if url.startswith(("http://", "https://", "mailto:", "//")):
+        return url
+    if url.startswith("/"):
+        # Handle absolute paths
+        if base.endswith("/"):
+            return base[:-1] + url
+        return base + url
+    return urljoin(base, url)
+
+
+class MarkdownGenerationStrategy(ABC):
+    """Abstract base class for markdown generation strategies."""
+
+    def __init__(
+        self,
+        content_filter: Optional[RelevantContentFilter] = None,
+        options: Optional[Dict[str, Any]] = None,
+        verbose: bool = False,
+        content_source: str = "cleaned_html",
+    ):
+        self.content_filter = content_filter
+        self.options = options or {}
+        self.verbose = verbose
+        self.content_source = content_source
+
+    @abstractmethod
+    def generate_markdown(
+        self,
+        input_html: str,
+        base_url: str = "",
+        html2text_options: Optional[Dict[str, Any]] = None,
+        content_filter: Optional[RelevantContentFilter] = None,
+        citations: bool = True,
+        **kwargs,
+    ) -> MarkdownGenerationResult:
+        """Generate markdown from the selected input HTML."""
+        pass
+
+
+class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
+    """
+    Default implementation of markdown generation strategy.
+
+    How it works:
+    1. Generate raw markdown from cleaned HTML.
+    2. Convert links to citations.
+    3. Generate fit markdown if content filter is provided.
+    4. Return MarkdownGenerationResult.
+
+    Args:
+        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
+        content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html".
+
+    Returns:
+        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+    """
+
+    def __init__(
+        self,
+        content_filter: Optional[RelevantContentFilter] = None,
+        options: Optional[Dict[str, Any]] = None,
+        content_source: str = "cleaned_html",
+    ):
+        super().__init__(content_filter, options, verbose=False, content_source=content_source)
+
+    def convert_links_to_citations(
+        self, markdown: str, base_url: str = ""
+    ) -> Tuple[str, str]:
+        """
+        Convert links in markdown to citations.
+
+        How it works:
+        1. Find all links in the markdown.
+        2. Convert links to citations.
+        3. Return converted markdown and references markdown.
+
+        Note:
+        This function uses a regex pattern to find links in markdown.
+
+        Args:
+            markdown (str): Markdown text.
+            base_url (str): Base URL for URL joins.
+
+        Returns:
+            Tuple[str, str]: Converted markdown and references markdown.
+        """
+        link_map = {}
+        url_cache = {}  # Cache for URL joins
+        parts = []
+        last_end = 0
+        counter = 1
+
+        for match in LINK_PATTERN.finditer(markdown):
+            parts.append(markdown[last_end : match.start()])
+            text, url, title = match.groups()
+
+            # Use cached URL if available, otherwise compute and cache
+            if base_url and not url.startswith(("http://", "https://", "mailto:")):
+                if url not in url_cache:
+                    url_cache[url] = fast_urljoin(base_url, url)
+                url = url_cache[url]
+
+            if url not in link_map:
+                desc = []
+                if title:
+                    desc.append(title)
+                if text and text != title:
+                    desc.append(text)
+                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
+                counter += 1
+
+            num = link_map[url][0]
+            parts.append(
+                f"{text}⟨{num}⟩"
+                if not match.group(0).startswith("!")
+                else f"![{text}⟨{num}⟩]"
+            )
+            last_end = match.end()
+
+        parts.append(markdown[last_end:])
+        converted_text = "".join(parts)
+
+        # Pre-build reference strings
+        references = ["\n\n## References\n\n"]
+        references.extend(
+            f"⟨{num}⟩ {url}{desc}\n"
+            for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
+        )
+
+        return converted_text, "".join(references)
+
+    def generate_markdown(
+        self,
+        input_html: str,
+        base_url: str = "",
+        html2text_options: Optional[Dict[str, Any]] = None,
+        options: Optional[Dict[str, Any]] = None,
+        content_filter: Optional[RelevantContentFilter] = None,
+        citations: bool = True,
+        **kwargs,
+    ) -> MarkdownGenerationResult:
+        """
+        Generate markdown with citations from the provided input HTML.
+
+        How it works:
+        1. Generate raw markdown from the input HTML.
+        2. Convert links to citations.
+        3. Generate fit markdown if content filter is provided.
+        4. Return MarkdownGenerationResult.
+
+        Args:
+            input_html (str): The HTML content to process (selected based on content_source).
+            base_url (str): Base URL for URL joins.
+            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
+            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
+            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+            citations (bool): Whether to generate citations.
+
+        Returns:
+            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+        """
+        try:
+            # Initialize HTML2Text with default options for better conversion
+            h = CustomHTML2Text(baseurl=base_url)
+            default_options = {
+                "body_width": 0,  # Disable text wrapping
+                "ignore_emphasis": False,
+                "ignore_links": False,
+                "ignore_images": False,
+                "protect_links": False,
+                "single_line_break": True,
+                "mark_code": True,
+                "escape_snob": False,
+            }
+
+            # Update with custom options if provided
+            if html2text_options:
+                default_options.update(html2text_options)
+            elif options:
+                default_options.update(options)
+            elif self.options:
+                default_options.update(self.options)
+
+            h.update_params(**default_options)
+
+            # Ensure we have valid input
+            if not input_html:
+                input_html = ""
+            elif not isinstance(input_html, str):
+                input_html = str(input_html)
+
+            # Generate raw markdown
+            try:
+                raw_markdown = h.handle(input_html)
+            except Exception as e:
+                raw_markdown = f"Error converting HTML to markdown: {str(e)}"
+
+            raw_markdown = raw_markdown.replace("    ```", "```")
+
+            # Convert links to citations
+            markdown_with_citations: str = raw_markdown
+            references_markdown: str = ""
+            if citations:
+                try:
+                    (
+                        markdown_with_citations,
+                        references_markdown,
+                    ) = self.convert_links_to_citations(raw_markdown, base_url)
+                except Exception as e:
+                    markdown_with_citations = raw_markdown
+                    references_markdown = f"Error generating citations: {str(e)}"
+
+            # Generate fit markdown if content filter is provided
+            fit_markdown: Optional[str] = ""
+            filtered_html: Optional[str] = ""
+            if content_filter or self.content_filter:
+                try:
+                    content_filter = content_filter or self.content_filter
+                    filtered_html = content_filter.filter_content(input_html)
+                    filtered_html = "\n".join(
+                        "<div>{}</div>".format(s) for s in filtered_html
+                    )
+                    fit_markdown = h.handle(filtered_html)
+                except Exception as e:
+                    fit_markdown = f"Error generating fit markdown: {str(e)}"
+                    filtered_html = ""
+
+            return MarkdownGenerationResult(
+                raw_markdown=raw_markdown or "",
+                markdown_with_citations=markdown_with_citations or "",
+                references_markdown=references_markdown or "",
+                fit_markdown=fit_markdown or "",
+                fit_html=filtered_html or "",
+            )
+        except Exception as e:
+            # If anything fails, return empty strings with error message
+            error_msg = f"Error in markdown generation: {str(e)}"
+            return MarkdownGenerationResult(
+                raw_markdown=error_msg,
+                markdown_with_citations=error_msg,
+                references_markdown="",
+                fit_markdown="",
+                fit_html="",
+            )
+
+```
+
+
+## File: crawl4ai/browser_manager.py
+
+```py
+import asyncio
+import time
+from typing import List, Optional
+import os
+import sys
+import shutil
+import tempfile
+import subprocess
+from playwright.async_api import BrowserContext
+import hashlib
+from .js_snippet import load_js_script
+from .config import DOWNLOAD_PAGE_TIMEOUT
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from playwright_stealth import StealthConfig
+from .utils import get_chromium_path
+
+# Module-level playwright_stealth configuration; every option listed here is
+# switched on.
+stealth_config = StealthConfig(
+    webdriver=True,
+    chrome_app=True,
+    chrome_csi=True,
+    chrome_load_times=True,
+    chrome_runtime=True,
+    navigator_languages=True,
+    navigator_plugins=True,
+    navigator_permissions=True,
+    webgl_vendor=True,
+    outerdimensions=True,
+    navigator_hardware_concurrency=True,
+    media_codecs=True,
+)
+
+# Browser command-line switches, mostly "--disable-*" flags plus a few other
+# startup settings.
+# NOTE(review): presumably appended to the browser launch arguments; the code
+# that consumes this list is outside this section - confirm.
+BROWSER_DISABLE_OPTIONS = [
+    "--disable-background-networking",
+    "--disable-background-timer-throttling",
+    "--disable-backgrounding-occluded-windows",
+    "--disable-breakpad",
+    "--disable-client-side-phishing-detection",
+    "--disable-component-extensions-with-background-pages",
+    "--disable-default-apps",
+    "--disable-extensions",
+    "--disable-features=TranslateUI",
+    "--disable-hang-monitor",
+    "--disable-ipc-flooding-protection",
+    "--disable-popup-blocking",
+    "--disable-prompt-on-repost",
+    "--disable-sync",
+    "--force-color-profile=srgb",
+    "--metrics-recording-only",
+    "--no-first-run",
+    "--password-store=basic",
+    "--use-mock-keychain",
+]
+
+
+class ManagedBrowser:
+    """
+    Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
+
+    Attributes:
+        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                            Default: "chromium".
+        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
+                                     temporary directory may be used. Default: None.
+        headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                         Default: True.
+        browser_process (subprocess.Popen): The process object for the browser.
+        temp_dir (str): Temporary directory for user data if not provided.
+        debugging_port (int): Port for debugging the browser.
+        host (str): Host for debugging the browser.
+
+        Methods:
+            start(): Starts the browser process and returns the CDP endpoint URL.
+            _get_browser_path(): Returns the browser executable path based on OS and browser type.
+            _get_browser_args(): Returns browser-specific command line arguments.
+            _get_user_data_dir(): Returns the user data directory path.
+            _cleanup(): Terminates the browser process and removes the temporary directory.
+            create_profile(): Static method to create a user profile by launching a browser for user interaction.
+    """
+
+    browser_type: str
+    user_data_dir: str
+    headless: bool
+    browser_process: subprocess.Popen
+    temp_dir: str
+    debugging_port: int
+    host: str
+
+    def __init__(
+        self,
+        browser_type: str = "chromium",
+        user_data_dir: Optional[str] = None,
+        headless: bool = False,
+        logger=None,
+        host: str = "localhost",
+        debugging_port: int = 9222,
+        cdp_url: Optional[str] = None, 
+        browser_config: Optional[BrowserConfig] = None,
+    ):
+        """
+        Initialize the ManagedBrowser instance.
+
+        Args:
+            browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
+                                Default: "chromium".
+            user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
+                                         temporary directory may be used. Default: None.
+            headless (bool): Whether to run the browser in headless mode (no visible GUI).
+                             Default: True.
+            logger (logging.Logger): Logger instance for logging messages. Default: None.
+            host (str): Host for debugging the browser. Default: "localhost".
+            debugging_port (int): Port for debugging the browser. Default: 9222.
+            cdp_url (str or None): CDP URL to connect to the browser. Default: None.
+            browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
+        """
+        self.browser_type = browser_config.browser_type
+        self.user_data_dir = browser_config.user_data_dir
+        self.headless = browser_config.headless
+        self.browser_process = None
+        self.temp_dir = None
+        self.debugging_port = browser_config.debugging_port
+        self.host = browser_config.host
+        self.logger = logger
+        self.shutting_down = False
+        self.cdp_url = browser_config.cdp_url
+        self.browser_config = browser_config
+
+    async def start(self) -> str:
+        """
+        Starts the browser process or returns CDP endpoint URL.
+        If cdp_url is provided, returns it directly.
+        If user_data_dir is not provided for local browser, creates a temporary directory.
+        
+        Returns:
+            str: CDP endpoint URL
+        """
+        # If CDP URL provided, just return it
+        if self.cdp_url:
+            return self.cdp_url
+
+        # Create temp dir if needed
+        if not self.user_data_dir:
+            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
+            self.user_data_dir = self.temp_dir
+
+        # Get browser path and args based on OS and browser type
+        # browser_path = self._get_browser_path()
+        args = await self._get_browser_args()
+        
+        if self.browser_config.extra_args:
+            args.extend(self.browser_config.extra_args)
+
+        # Start browser process
+        try:
+            # Use DETACHED_PROCESS flag on Windows to fully detach the process
+            # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
+            if sys.platform == "win32":
+                self.browser_process = subprocess.Popen(
+                    args, 
+                    stdout=subprocess.PIPE, 
+                    stderr=subprocess.PIPE,
+                    creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
+                )
+            else:
+                self.browser_process = subprocess.Popen(
+                    args, 
+                    stdout=subprocess.PIPE, 
+                    stderr=subprocess.PIPE,
+                    preexec_fn=os.setpgrp  # Start in a new process group
+                )
+                
+            # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
+            await asyncio.sleep(0.5)  # Give browser time to start
+            await self._initial_startup_check()
+            await asyncio.sleep(2)  # Give browser time to start
+            return f"http://{self.host}:{self.debugging_port}"
+        except Exception as e:
+            await self.cleanup()
+            raise Exception(f"Failed to start browser: {e}")
+
+    async def _initial_startup_check(self):
+        """
+        Perform a quick check to make sure the browser started successfully.
+        This only runs once at startup rather than continuously monitoring.
+        """
+        if not self.browser_process:
+            return
+            
+        # Check that process started without immediate termination
+        await asyncio.sleep(0.5)
+        if self.browser_process.poll() is not None:
+            # Process already terminated
+            stdout, stderr = b"", b""
+            try:
+                stdout, stderr = self.browser_process.communicate(timeout=0.5)
+            except subprocess.TimeoutExpired:
+                pass
+                
+            self.logger.error(
+                message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
+                tag="ERROR",
+                params={
+                    "code": self.browser_process.returncode,
+                    "stdout": stdout.decode() if stdout else "",
+                    "stderr": stderr.decode() if stderr else "",
+                },
+            )
+    
+    async def _monitor_browser_process(self):
+        """
+        Monitor the browser process for unexpected termination.
+
+        How it works:
+        1. Read stdout and stderr from the browser process.
+        2. If the process has terminated, log the error message and terminate the browser.
+        3. If the shutting_down flag is set, log the normal termination message.
+        4. If any other error occurs, log the error message.
+
+        Note: This method should be called in a separate task to avoid blocking the main event loop.
+        This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process.
+        """
+        if self.browser_process:
+            try:
+                stdout, stderr = await asyncio.gather(
+                    asyncio.to_thread(self.browser_process.stdout.read),
+                    asyncio.to_thread(self.browser_process.stderr.read),
+                )
+
+                # Check shutting_down flag BEFORE logging anything
+                if self.browser_process.poll() is not None:
+                    if not self.shutting_down:
+                        self.logger.error(
+                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
+                            tag="ERROR",
+                            params={
+                                "code": self.browser_process.returncode,
+                                "stdout": stdout.decode(),
+                                "stderr": stderr.decode(),
+                            },
+                        )
+                        await self.cleanup()
+                    else:
+                        self.logger.info(
+                            message="Browser process terminated normally | Code: {code}",
+                            tag="INFO",
+                            params={"code": self.browser_process.returncode},
+                        )
+            except Exception as e:
+                if not self.shutting_down:
+                    self.logger.error(
+                        message="Error monitoring browser process: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)},
+                    )
+
+    def _get_browser_path_WIP(self) -> str:
+        """Returns the browser executable path based on OS and browser type"""
+        if sys.platform == "darwin":  # macOS
+            paths = {
+                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
+                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
+            }
+        elif sys.platform == "win32":  # Windows
+            paths = {
+                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
+                "webkit": None,  # WebKit not supported on Windows
+            }
+        else:  # Linux
+            paths = {
+                "chromium": "google-chrome",
+                "firefox": "firefox",
+                "webkit": None,  # WebKit not supported on Linux
+            }
+
+        return paths.get(self.browser_type)
+
+    async def _get_browser_path(self) -> str:
+        browser_path = await get_chromium_path(self.browser_type)
+        return browser_path
+
+    async def _get_browser_args(self) -> List[str]:
+        """Returns browser-specific command line arguments"""
+        base_args = [await self._get_browser_path()]
+
+        if self.browser_type == "chromium":
+            args = [
+                f"--remote-debugging-port={self.debugging_port}",
+                f"--user-data-dir={self.user_data_dir}",
+            ]
+            if self.headless:
+                args.append("--headless=new")
+        elif self.browser_type == "firefox":
+            args = [
+                "--remote-debugging-port",
+                str(self.debugging_port),
+                "--profile",
+                self.user_data_dir,
+            ]
+            if self.headless:
+                args.append("--headless")
+        else:
+            raise NotImplementedError(f"Browser type {self.browser_type} not supported")
+
+        return base_args + args
+
+    async def cleanup(self):
+        """Cleanup browser process and temporary directory"""
+        # Set shutting_down flag BEFORE any termination actions
+        self.shutting_down = True
+
+        if self.browser_process:
+            try:
+                # For builtin browsers that should persist, we should check if it's a detached process
+                # Only terminate if we have proper control over the process
+                if not self.browser_process.poll():
+                    # Process is still running
+                    self.browser_process.terminate()
+                    # Wait for process to end gracefully
+                    for _ in range(10):  # 10 attempts, 100ms each
+                        if self.browser_process.poll() is not None:
+                            break
+                        await asyncio.sleep(0.1)
+
+                    # Force kill if still running
+                    if self.browser_process.poll() is None:
+                        if sys.platform == "win32":
+                            # On Windows we might need taskkill for detached processes
+                            try:
+                                subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
+                            except Exception:
+                                self.browser_process.kill()
+                        else:
+                            self.browser_process.kill()
+                        await asyncio.sleep(0.1)  # Brief wait for kill to take effect
+
+            except Exception as e:
+                self.logger.error(
+                    message="Error terminating browser: {error}",
+                    tag="ERROR", 
+                    params={"error": str(e)},
+                )
+
+        if self.temp_dir and os.path.exists(self.temp_dir):
+            try:
+                shutil.rmtree(self.temp_dir)
+            except Exception as e:
+                self.logger.error(
+                    message="Error removing temporary directory: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)},
+                )
+                
+    # These methods have been moved to BrowserProfiler class
+    @staticmethod
+    async def create_profile(browser_config=None, profile_name=None, logger=None):
+        """
+        This method has been moved to the BrowserProfiler class.
+        
+        Creates a browser profile by launching a browser for interactive user setup
+        and waits until the user closes it. The profile is stored in a directory that
+        can be used later with BrowserConfig.user_data_dir.
+        
+        Please use BrowserProfiler.create_profile() instead.
+        
+        Example:
+            ```python
+            from crawl4ai.browser_profiler import BrowserProfiler
+            
+            profiler = BrowserProfiler()
+            profile_path = await profiler.create_profile(profile_name="my-login-profile")
+            ```
+        """
+        from .browser_profiler import BrowserProfiler
+        
+        # Create a BrowserProfiler instance and delegate to it
+        profiler = BrowserProfiler(logger=logger)
+        return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config)
+    
+    @staticmethod
+    def list_profiles():
+        """
+        This method has been moved to the BrowserProfiler class.
+        
+        Lists all available browser profiles in the Crawl4AI profiles directory.
+        
+        Please use BrowserProfiler.list_profiles() instead.
+        
+        Example:
+            ```python
+            from crawl4ai.browser_profiler import BrowserProfiler
+            
+            profiler = BrowserProfiler()
+            profiles = profiler.list_profiles()
+            ```
+        """
+        from .browser_profiler import BrowserProfiler
+        
+        # Create a BrowserProfiler instance and delegate to it
+        profiler = BrowserProfiler()
+        return profiler.list_profiles()
+        
+    @staticmethod
+    def delete_profile(profile_name_or_path):
+        """
+        This method has been moved to the BrowserProfiler class.
+        
+        Delete a browser profile by name or path.
+        
+        Please use BrowserProfiler.delete_profile() instead.
+        
+        Example:
+            ```python
+            from crawl4ai.browser_profiler import BrowserProfiler
+            
+            profiler = BrowserProfiler()
+            success = profiler.delete_profile("my-profile")
+            ```
+        """
+        from .browser_profiler import BrowserProfiler
+        
+        # Create a BrowserProfiler instance and delegate to it
+        profiler = BrowserProfiler()
+        return profiler.delete_profile(profile_name_or_path)
+
+
+
+
+class BrowserManager:
+    """
+    Manages the browser instance and context.
+
+    Attributes:
+        config (BrowserConfig): Configuration object containing all browser settings
+        logger: Logger instance for recording events and errors
+        browser (Browser): The browser instance
+        default_context (BrowserContext): The default browser context
+        managed_browser (ManagedBrowser): The managed browser instance
+        playwright (Playwright): The Playwright instance
+        sessions (dict): Dictionary to store session information
+        session_ttl (int): Session timeout in seconds
+    """
+
+    _playwright_instance = None
+    
+    @classmethod
+    async def get_playwright(cls):
+        from playwright.async_api import async_playwright
+        cls._playwright_instance = await async_playwright().start()
+        return cls._playwright_instance    
+
+    def __init__(self, browser_config: BrowserConfig, logger=None):
+        """
+        Initialize the BrowserManager with a browser configuration.
+
+        Args:
+            browser_config (BrowserConfig): Configuration object containing all browser settings
+            logger: Logger instance for recording events and errors
+        """
+        self.config: BrowserConfig = browser_config
+        self.logger = logger
+
+        # Browser state
+        self.browser = None
+        self.default_context = None
+        self.managed_browser = None
+        self.playwright = None
+
+        # Session management
+        self.sessions = {}
+        self.session_ttl = 1800  # 30 minutes
+
+        # Keep track of contexts by a "config signature," so each unique config reuses a single context
+        self.contexts_by_config = {}
+        self._contexts_lock = asyncio.Lock() 
+
+        # Initialize ManagedBrowser if needed
+        if self.config.use_managed_browser:
+            self.managed_browser = ManagedBrowser(
+                browser_type=self.config.browser_type,
+                user_data_dir=self.config.user_data_dir,
+                headless=self.config.headless,
+                logger=self.logger,
+                debugging_port=self.config.debugging_port,
+                cdp_url=self.config.cdp_url,
+                browser_config=self.config,
+            )
+
+    async def start(self):
+        """
+        Start the browser instance and set up the default context.
+
+        How it works:
+        1. Check if Playwright is already initialized.
+        2. If not, initialize Playwright.
+        3. If managed browser is used, start it and connect to the CDP endpoint.
+        4. If managed browser is not used, launch the browser and set up the default context.
+
+        Note: This method should be called in a separate task to avoid blocking the main event loop.
+        """
+        if self.playwright is not None:
+            await self.close()
+            
+        from playwright.async_api import async_playwright
+
+        self.playwright = await async_playwright().start()
+
+        if self.config.cdp_url or self.config.use_managed_browser:
+            self.config.use_managed_browser = True
+            cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
+            self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
+            contexts = self.browser.contexts
+            if contexts:
+                self.default_context = contexts[0]
+            else:
+                self.default_context = await self.create_browser_context()
+            await self.setup_context(self.default_context)
+        else:
+            browser_args = self._build_browser_args()
+
+            # Launch appropriate browser type
+            if self.config.browser_type == "firefox":
+                self.browser = await self.playwright.firefox.launch(**browser_args)
+            elif self.config.browser_type == "webkit":
+                self.browser = await self.playwright.webkit.launch(**browser_args)
+            else:
+                self.browser = await self.playwright.chromium.launch(**browser_args)
+
+            self.default_context = self.browser
+
+
+    def _build_browser_args(self) -> dict:
+        """Build browser launch arguments from config."""
+        args = [
+            "--disable-gpu",
+            "--disable-gpu-compositing",
+            "--disable-software-rasterizer",
+            "--no-sandbox",
+            "--disable-dev-shm-usage",
+            "--no-first-run",
+            "--no-default-browser-check",
+            "--disable-infobars",
+            "--window-position=0,0",
+            "--ignore-certificate-errors",
+            "--ignore-certificate-errors-spki-list",
+            "--disable-blink-features=AutomationControlled",
+            "--window-position=400,0",
+            "--disable-renderer-backgrounding",
+            "--disable-ipc-flooding-protection",
+            "--force-color-profile=srgb",
+            "--mute-audio",
+            "--disable-background-timer-throttling",
+            # "--single-process",
+            f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
+        ]
+
+        if self.config.light_mode:
+            args.extend(BROWSER_DISABLE_OPTIONS)
+
+        if self.config.text_mode:
+            args.extend(
+                [
+                    "--blink-settings=imagesEnabled=false",
+                    "--disable-remote-fonts",
+                    "--disable-images",
+                    "--disable-javascript",
+                    "--disable-software-rasterizer",
+                    "--disable-dev-shm-usage",
+                ]
+            )
+
+        if self.config.extra_args:
+            args.extend(self.config.extra_args)
+
+        # Deduplicate args
+        args = list(dict.fromkeys(args))
+        
+        browser_args = {"headless": self.config.headless, "args": args}
+
+        if self.config.chrome_channel:
+            browser_args["channel"] = self.config.chrome_channel
+
+        if self.config.accept_downloads:
+            browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
+                os.getcwd(), "downloads"
+            )
+            os.makedirs(browser_args["downloads_path"], exist_ok=True)
+
+        if self.config.proxy or self.config.proxy_config:
+            from playwright.async_api import ProxySettings
+
+            proxy_settings = (
+                ProxySettings(server=self.config.proxy)
+                if self.config.proxy
+                else ProxySettings(
+                    server=self.config.proxy_config.server,
+                    username=self.config.proxy_config.username,
+                    password=self.config.proxy_config.password,
+                )
+            )
+            browser_args["proxy"] = proxy_settings
+
+        return browser_args
+
+    async def setup_context(
+        self,
+        context: BrowserContext,
+        crawlerRunConfig: CrawlerRunConfig = None,
+        is_default=False,
+    ):
+        """
+        Set up a browser context with the configured options.
+
+        How it works:
+        1. Set extra HTTP headers if provided.
+        2. Add cookies if provided.
+        3. Load storage state if provided.
+        4. Accept downloads if enabled.
+        5. Set default timeouts for navigation and download.
+        6. Set user agent if provided.
+        7. Set browser hints if provided.
+        8. Set proxy if provided.
+        9. Set downloads path if provided.
+        10. Set storage state if provided.
+        11. Set cache if provided.
+        12. Set extra HTTP headers if provided.
+        13. Add cookies if provided.
+        14. Set default timeouts for navigation and download if enabled.
+        15. Set user agent if provided.
+        16. Set browser hints if provided.
+
+        Args:
+            context (BrowserContext): The browser context to set up
+            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
+            is_default (bool): Flag indicating if this is the default context
+        Returns:
+            None
+        """
+        if self.config.headers:
+            await context.set_extra_http_headers(self.config.headers)
+
+        if self.config.cookies:
+            await context.add_cookies(self.config.cookies)
+
+        if self.config.storage_state:
+            await context.storage_state(path=None)
+
+        if self.config.accept_downloads:
+            context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
+            context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
+            if self.config.downloads_path:
+                context._impl_obj._options["accept_downloads"] = True
+                context._impl_obj._options[
+                    "downloads_path"
+                ] = self.config.downloads_path
+
+        # Handle user agent and browser hints
+        if self.config.user_agent:
+            combined_headers = {
+                "User-Agent": self.config.user_agent,
+                "sec-ch-ua": self.config.browser_hint,
+            }
+            combined_headers.update(self.config.headers)
+            await context.set_extra_http_headers(combined_headers)
+
+        # Add default cookie
+        await context.add_cookies(
+            [
+                {
+                    "name": "cookiesEnabled",
+                    "value": "true",
+                    "url": crawlerRunConfig.url
+                    if crawlerRunConfig and crawlerRunConfig.url
+                    else "https://crawl4ai.com/",
+                }
+            ]
+        )
+
+        # Handle navigator overrides
+        if crawlerRunConfig:
+            if (
+                crawlerRunConfig.override_navigator
+                or crawlerRunConfig.simulate_user
+                or crawlerRunConfig.magic
+            ):
+                await context.add_init_script(load_js_script("navigator_overrider"))        
+
+    async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
+        """
+        Creates and returns a new browser context with configured settings.
+        Applies text-only mode settings if text_mode is enabled in config.
+
+        Returns:
+            Context: Browser context object with the specified configurations
+        """
+        # Base settings
+        user_agent = self.config.headers.get("User-Agent", self.config.user_agent) 
+        viewport_settings = {
+            "width": self.config.viewport_width,
+            "height": self.config.viewport_height,
+        }
+        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
+
+        blocked_extensions = [
+            # Images
+            "jpg",
+            "jpeg",
+            "png",
+            "gif",
+            "webp",
+            "svg",
+            "ico",
+            "bmp",
+            "tiff",
+            "psd",
+            # Fonts
+            "woff",
+            "woff2",
+            "ttf",
+            "otf",
+            "eot",
+            # Styles
+            # 'css', 'less', 'scss', 'sass',
+            # Media
+            "mp4",
+            "webm",
+            "ogg",
+            "avi",
+            "mov",
+            "wmv",
+            "flv",
+            "m4v",
+            "mp3",
+            "wav",
+            "aac",
+            "m4a",
+            "opus",
+            "flac",
+            # Documents
+            "pdf",
+            "doc",
+            "docx",
+            "xls",
+            "xlsx",
+            "ppt",
+            "pptx",
+            # Archives
+            "zip",
+            "rar",
+            "7z",
+            "tar",
+            "gz",
+            # Scripts and data
+            "xml",
+            "swf",
+            "wasm",
+        ]
+
+        # Common context settings
+        context_settings = {
+            "user_agent": user_agent,
+            "viewport": viewport_settings,
+            "proxy": proxy_settings,
+            "accept_downloads": self.config.accept_downloads,
+            "storage_state": self.config.storage_state,
+            "ignore_https_errors": self.config.ignore_https_errors,
+            "device_scale_factor": 1.0,
+            "java_script_enabled": self.config.java_script_enabled,
+        }
+        
+        if crawlerRunConfig:
+            # Check if there is value for crawlerRunConfig.proxy_config set add that to context
+            if crawlerRunConfig.proxy_config:
+                proxy_settings = {
+                    "server": crawlerRunConfig.proxy_config.server,
+                }
+                if crawlerRunConfig.proxy_config.username:
+                    proxy_settings.update({
+                        "username": crawlerRunConfig.proxy_config.username,
+                        "password": crawlerRunConfig.proxy_config.password,
+                    })
+                context_settings["proxy"] = proxy_settings
+
+        if self.config.text_mode:
+            text_mode_settings = {
+                "has_touch": False,
+                "is_mobile": False,
+            }
+            # Update context settings with text mode settings
+            context_settings.update(text_mode_settings)
+
+        # Create and return the context with all settings
+        context = await self.browser.new_context(**context_settings)
+
+        # Apply text mode settings if enabled
+        if self.config.text_mode:
+            # Create and apply route patterns for each extension
+            for ext in blocked_extensions:
+                await context.route(f"**/*.{ext}", lambda route: route.abort())
+        return context
+
+    def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
+        """
+        Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
+        then returns a hash of the sorted JSON. This yields a stable signature
+        that identifies configurations requiring a unique browser context.
+        """
+        import json
+
+        config_dict = crawlerRunConfig.__dict__.copy()
+        # Exclude items that do not affect browser-level setup.
+        # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
+        ephemeral_keys = [
+            "session_id",
+            "js_code",
+            "scraping_strategy",
+            "extraction_strategy",
+            "chunking_strategy",
+            "cache_mode",
+            "content_filter",
+            "semaphore_count",
+            "url"
+        ]
+        for key in ephemeral_keys:
+            if key in config_dict:
+                del config_dict[key]
+        # Convert to canonical JSON string
+        signature_json = json.dumps(config_dict, sort_keys=True, default=str)
+
+        # Hash the JSON so we get a compact, unique string
+        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
+        return signature_hash
+
+    async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
+        """
+        Get a page for the given session ID, creating a new one if needed.
+
+        Args:
+            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
+
+        Returns:
+            (page, context): The Page and its BrowserContext
+        """
+        self._cleanup_expired_sessions()
+
+        # If a session_id is provided and we already have it, reuse that page + context
+        if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
+            context, page, _ = self.sessions[crawlerRunConfig.session_id]
+            # Update last-used timestamp
+            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
+            return page, context
+
+        # If using a managed browser, just grab the shared default_context
+        if self.config.use_managed_browser:
+            context = self.default_context
+            pages = context.pages
+            page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
+            if not page:
+                page = await context.new_page()
+        else:
+            # Otherwise, check if we have an existing context for this config
+            config_signature = self._make_config_signature(crawlerRunConfig)
+
+            async with self._contexts_lock:
+                if config_signature in self.contexts_by_config:
+                    context = self.contexts_by_config[config_signature]
+                else:
+                    # Create and setup a new context
+                    context = await self.create_browser_context(crawlerRunConfig)
+                    await self.setup_context(context, crawlerRunConfig)
+                    self.contexts_by_config[config_signature] = context
+
+            # Create a new page from the chosen context
+            page = await context.new_page()
+
+        # If a session_id is specified, store this session so we can reuse later
+        if crawlerRunConfig.session_id:
+            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
+
+        return page, context
+
+    async def kill_session(self, session_id: str):
+        """
+        Kill a browser session and clean up resources.
+
+        Args:
+            session_id (str): The session ID to kill.
+        """
+        if session_id in self.sessions:
+            context, page, _ = self.sessions[session_id]
+            await page.close()
+            if not self.config.use_managed_browser:
+                await context.close()
+            del self.sessions[session_id]
+
+    def _cleanup_expired_sessions(self):
+        """Clean up expired sessions based on TTL."""
+        current_time = time.time()
+        expired_sessions = [
+            sid
+            for sid, (_, _, last_used) in self.sessions.items()
+            if current_time - last_used > self.session_ttl
+        ]
+        for sid in expired_sessions:
+            asyncio.create_task(self.kill_session(sid))
+
+    async def close(self):
+        """Close all browser resources and clean up."""
+        if self.config.cdp_url:
+            return
+        
+        if self.config.sleep_on_close:
+            await asyncio.sleep(0.5)
+
+        session_ids = list(self.sessions.keys())
+        for session_id in session_ids:
+            await self.kill_session(session_id)
+
+        # Now close all contexts we created. This reclaims memory from ephemeral contexts.
+        for ctx in self.contexts_by_config.values():
+            try:
+                await ctx.close()
+            except Exception as e:
+                self.logger.error(
+                    message="Error closing context: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)}
+                )
+        self.contexts_by_config.clear()
+
+        if self.browser:
+            await self.browser.close()
+            self.browser = None
+
+        if self.managed_browser:
+            await asyncio.sleep(0.5)
+            await self.managed_browser.cleanup()
+            self.managed_browser = None
+
+        if self.playwright:
+            await self.playwright.stop()
+            self.playwright = None
+
+```
+
+
+
+
+## File: docs/examples/quickstart.py
+
+```py
+import os, sys
+
+from crawl4ai import LLMConfig
+
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+import time
+import json
+import re
+from typing import Dict
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.extraction_strategy import (
+    JsonCssExtractionStrategy,
+    LLMExtractionStrategy,
+)
+
+# Resolved directory of this script; used further down to build paths for generated files.
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+# Banner printed at module level, i.e. as soon as this file is executed or imported.
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+# Basic Example - Simple Crawl
+async def simple_crawl():
+    print("\n--- Basic Usage ---")
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        print(result.markdown[:500])
+
+
+async def clean_content():
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        excluded_tags=["nav", "footer", "aside"],
+        remove_overlay_elements=True,
+        markdown_generator=DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(
+                threshold=0.48, threshold_type="fixed", min_word_threshold=0
+            ),
+            options={"ignore_links": True},
+        ),
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Apple",
+            config=crawler_config,
+        )
+        full_markdown_length = len(result.markdown.raw_markdown)
+        fit_markdown_length = len(result.markdown.fit_markdown)
+        print(f"Full Markdown Length: {full_markdown_length}")
+        print(f"Fit Markdown Length: {fit_markdown_length}")
+
+
+async def link_analysis():
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+        exclude_external_links=True,
+        exclude_social_media_links=True,
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config,
+        )
+        print(f"Found {len(result.links['internal'])} internal links")
+        print(f"Found {len(result.links['external'])} external links")
+
+        for link in result.links["internal"][:5]:
+            print(f"Href: {link['href']}\nText: {link['text']}\n")
+
+
+# JavaScript Execution Example
+async def simple_example_with_running_js_code():
+    print("\n--- Executing JavaScript and Using CSS Selectors ---")
+
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
+        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        print(result.markdown[:500])
+
+
+# CSS Selector Example
+async def simple_example_with_css_selector():
+    print("\n--- Using CSS Selectors ---")
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        print(result.markdown[:500])
+
+
+async def media_handling():
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        for img in result.media["images"][:5]:
+            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
+
+async def custom_hook_workflow(verbose=True):
+    async with AsyncWebCrawler() as crawler:
+        # Set a 'before_goto' hook to run custom code just before navigation
+        crawler.crawler_strategy.set_hook(
+            "before_goto",
+            lambda page, context: print("[Hook] Preparing to navigate..."),
+        )
+
+        # Perform the crawl operation
+        result = await crawler.arun(url="https://crawl4ai.com")
+        print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
+
+
+# Proxy Example
+async def use_proxy():
+    print("\n--- Using a Proxy ---")
+    browser_config = BrowserConfig(
+        headless=True,
+        proxy_config={
+            "server": "http://proxy.example.com:8080",
+            "username": "username",
+            "password": "password",
+        },
+    )
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        if result.success:
+            print(result.markdown[:500])
+
+
+# Screenshot Example
+async def capture_and_save_screenshot(url: str, output_path: str):
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url=url, config=crawler_config)
+
+        if result.success and result.screenshot:
+            import base64
+
+            screenshot_data = base64.b64decode(result.screenshot)
+            with open(output_path, "wb") as f:
+                f.write(screenshot_data)
+            print(f"Screenshot saved successfully to {output_path}")
+        else:
+            print("Failed to capture screenshot")
+
+
+# LLM Extraction Example
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )
+
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+
+    if api_token is None and provider != "ollama":
+        print(f"API token is required for {provider}. Skipping this example.")
+        return
+
+    browser_config = BrowserConfig(headless=True)
+
+    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+    if extra_headers:
+        extra_args["extra_headers"] = extra_headers
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=1,
+        page_timeout=80000,
+        extraction_strategy=LLMExtractionStrategy(
+            llm_config=LLMConfig(provider=provider,api_token=api_token),
+            schema=OpenAIModelFee.model_json_schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+            Do not miss any models in the entire content.""",
+            extra_args=extra_args,
+        ),
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://openai.com/api/pricing/", config=crawler_config
+        )
+        print(result.extracted_content)
+
+
+# CSS Extraction Example
+async def extract_structured_data_using_css_extractor():
+    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+    schema = {
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src",
+            },
+        ],
+    }
+
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+    js_click_tabs = """
+    (async () => {
+        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+        for(let tab of tabs) {
+            tab.scrollIntoView();
+            tab.click();
+            await new Promise(r => setTimeout(r, 500));
+        }
+    })();
+    """
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        js_code=[js_click_tabs],
+        delay_before_return_html=1
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.kidocode.com/degrees/technology", config=crawler_config
+        )
+
+        companies = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(companies)} companies")
+        print(json.dumps(companies[0], indent=2))
+
+
+# Dynamic Content Examples - Method 1
+async def crawl_dynamic_content_pages_method_1():
+    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+    first_commit = ""
+
+    async def on_execution_started(page, **kwargs):
+        nonlocal first_commit
+        try:
+            while True:
+                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await commit.evaluate("(element) => element.textContent")
+                commit = re.sub(r"\s+", "", commit)
+                if commit and commit != first_commit:
+                    first_commit = commit
+                    break
+                await asyncio.sleep(0.5)
+        except Exception as e:
+            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+    browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        session_id = "typescript_commits_session"
+        all_commits = []
+
+        js_next_page = """
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+        """
+
+        for page in range(3):
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                css_selector="li.Box-sc-g0xbh4-0",
+                js_code=js_next_page if page > 0 else None,
+                js_only=page > 0,
+                session_id=session_id,
+            )
+
+            result = await crawler.arun(url=url, config=crawler_config)
+            assert result.success, f"Failed to crawl page {page + 1}"
+
+            soup = BeautifulSoup(result.cleaned_html, "html.parser")
+            commits = soup.select("li")
+            all_commits.extend(commits)
+
+            print(f"Page {page + 1}: Found {len(commits)} commits")
+
+        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+# Dynamic Content Examples - Method 2
+async def crawl_dynamic_content_pages_method_2():
+    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+    browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+    js_next_page_and_wait = """
+    (async () => {
+        const getCurrentCommit = () => {
+            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+            return commits.length > 0 ? commits[0].textContent.trim() : null;
+        };
+
+        const initialCommit = getCurrentCommit();
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+
+        while (true) {
+            await new Promise(resolve => setTimeout(resolve, 100));
+            const newCommit = getCurrentCommit();
+            if (newCommit && newCommit !== initialCommit) {
+                break;
+            }
+        }
+    })();
+    """
+
+    schema = {
+        "name": "Commit Extractor",
+        "baseSelector": "li.Box-sc-g0xbh4-0",
+        "fields": [
+            {
+                "name": "title",
+                "selector": "h4.markdown-title",
+                "type": "text",
+                "transform": "strip",
+            },
+        ],
+    }
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        session_id = "typescript_commits_session"
+        all_commits = []
+
+        extraction_strategy = JsonCssExtractionStrategy(schema)
+
+        for page in range(3):
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                css_selector="li.Box-sc-g0xbh4-0",
+                extraction_strategy=extraction_strategy,
+                js_code=js_next_page_and_wait if page > 0 else None,
+                js_only=page > 0,
+                session_id=session_id,
+            )
+
+            result = await crawler.arun(url=url, config=crawler_config)
+            assert result.success, f"Failed to crawl page {page + 1}"
+
+            commits = json.loads(result.extracted_content)
+            all_commits.extend(commits)
+            print(f"Page {page + 1}: Found {len(commits)} commits")
+
+        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+async def cosine_similarity_extraction():
+    from crawl4ai.extraction_strategy import CosineStrategy
+    crawl_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=CosineStrategy(
+            word_count_threshold=10,
+            max_dist=0.2,  # Maximum distance between two words
+            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
+            top_k=3,  # Number of top keywords to extract
+            sim_threshold=0.3,  # Similarity threshold for clustering
+            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
+            verbose=True,
+        ),
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
+            config=crawl_config,
+        )
+        print(json.loads(result.extracted_content)[:5])
+
+
+# Browser Comparison
+async def crawl_custom_browser_type():
+    print("\n--- Browser Comparison ---")
+
+    # Firefox
+    browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+        )
+        print("Firefox:", time.time() - start)
+        print(result.markdown[:500])
+
+    # WebKit
+    browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+        )
+        print("WebKit:", time.time() - start)
+        print(result.markdown[:500])
+
+    # Chromium (default)
+    browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+        )
+        print("Chromium:", time.time() - start)
+        print(result.markdown[:500])
+
+
+# Anti-Bot and User Simulation
+async def crawl_with_user_simulation():
+    browser_config = BrowserConfig(
+        headless=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+    )
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        magic=True,
+        simulate_user=True,
+        override_navigator=True,
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
+        print(result.markdown)
+
+
+async def ssl_certification():
+    # Configure crawler to fetch SSL certificate
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+
+            tmp_dir = os.path.join(__location__, "tmp")
+            os.makedirs(tmp_dir, exist_ok=True)
+
+            # 1. Access certificate properties directly
+            print("\nCertificate Information:")
+            print(f"Issuer: {cert.issuer.get('CN', '')}")
+            print(f"Valid until: {cert.valid_until}")
+            print(f"Fingerprint: {cert.fingerprint}")
+
+            # 2. Export certificate in different formats
+            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
+            print("\nCertificate exported to:")
+            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+            pem_data = cert.to_pem(
+                os.path.join(tmp_dir, "certificate.pem")
+            )  # For web servers
+            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+            der_data = cert.to_der(
+                os.path.join(tmp_dir, "certificate.der")
+            )  # For Java apps
+            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+
+# Main execution
+async def main():
+    # Basic examples
+    await simple_crawl()
+    await simple_example_with_running_js_code()
+    await simple_example_with_css_selector()
+
+    # Advanced examples
+    await extract_structured_data_using_css_extractor()
+    await extract_structured_data_using_llm(
+        "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
+    )
+    await crawl_dynamic_content_pages_method_1()
+    await crawl_dynamic_content_pages_method_2()
+
+    # Browser comparisons
+    await crawl_custom_browser_type()
+
+    # Screenshot example
+    await capture_and_save_screenshot(
+        "https://www.example.com",
+        os.path.join(__location__, "tmp/example_screenshot.jpg")
+    )
+
+
+# Run every example only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    asyncio.run(main())
+
+```
+
+
+## File: docs/examples/quickstart_examples_set_1.py
+
+```py
+import asyncio
+import os
+import json
+import base64
+from pathlib import Path
+from typing import List
+from crawl4ai import ProxyConfig
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
+from crawl4ai import RoundRobinProxyStrategy
+from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
+from crawl4ai import LLMConfig
+from crawl4ai import PruningContentFilter, BM25ContentFilter
+from crawl4ai import DefaultMarkdownGenerator
+from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
+from crawl4ai import BrowserConfig
+
+# Directory containing this file; the demos below write their output under "<this dir>/tmp".
+__cur_dir__ = Path(__file__).parent
+
+async def demo_basic_crawl():
+    """Basic web crawling with markdown generation"""
+    print("\n=== 1. Basic Web Crawling ===")
+    async with AsyncWebCrawler(config = BrowserConfig(
+        viewport_height=800,
+        viewport_width=1200,
+        headless=True,
+        verbose=True,
+    )) as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            url="https://news.ycombinator.com/"
+        )
+
+        for i, result in enumerate(results):
+            print(f"Result {i + 1}:")
+            print(f"Success: {result.success}")
+            if result.success:
+                print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
+                print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
+            else:
+                print("Failed to crawl the URL")
+
+async def demo_parallel_crawl():
+    """Crawl multiple URLs in parallel"""
+    print("\n=== 2. Parallel Crawling ===")
+
+    urls = [
+        "https://news.ycombinator.com/",
+        "https://example.com/",
+        "https://httpbin.org/html",
+    ]
+
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun_many(
+            urls=urls,
+        )
+
+        print(f"Crawled {len(results)} URLs in parallel:")
+        for i, result in enumerate(results):
+            print(
+                f"  {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
+            )
+
+async def demo_fit_markdown():
+    """Generate focused markdown with LLM content filter"""
+    print("\n=== 3. Fit Markdown with LLM Content Filter ===")
+
+    async with AsyncWebCrawler() as crawler:
+        result: CrawlResult = await crawler.arun(
+            url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
+            config=CrawlerRunConfig(
+                markdown_generator=DefaultMarkdownGenerator(
+                    content_filter=PruningContentFilter()
+                )
+            ),
+        )
+
+        # Print stats and save the fit markdown
+        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
+        print(f"Fit: {len(result.markdown.fit_markdown)} chars")
+
+async def demo_llm_structured_extraction_no_schema():
+    # Create a simple LLM extraction strategy (no schema required)
+    extraction_strategy = LLMExtractionStrategy(
+        llm_config=LLMConfig(
+            provider="groq/qwen-2.5-32b",
+            api_token="env:GROQ_API_KEY",
+        ),
+        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
+        extract_type="schema",
+        schema="{title: string, url: string, comments: int}",
+        extra_args={
+            "temperature": 0.0,
+            "max_tokens": 4096,
+        },
+        verbose=True,
+    )
+
+    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            "https://news.ycombinator.com/", config=config
+        )
+
+        for result in results:
+            print(f"URL: {result.url}")
+            print(f"Success: {result.success}")
+            if result.success:
+                data = json.loads(result.extracted_content)
+                print(json.dumps(data, indent=2))
+            else:
+                print("Failed to extract structured data")
+
+async def demo_css_structured_extraction_no_schema():
+    """Extract structured data using CSS selectors"""
+    print("\n=== 5. CSS-Based Structured Extraction ===")
+    # Sample HTML for schema generation (one-time cost)
+    sample_html = """
+<div class="body-post clear">
+    <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
+        <div class="clear home-post-box cf">
+            <div class="home-img clear">
+                <div class="img-ratio">
+                    <img alt="..." src="...">
+                </div>
+            </div>
+            <div class="clear home-right">
+                <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
+                <div class="item-label">
+                    <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
+                    <span class="h-tags">Malware / Supply Chain Attack</span>
+                </div>
+                <div class="home-desc"> Cybersecurity researchers have...</div>
+            </div>
+        </div>
+    </a>
+</div>
+    """
+
+    # Check if schema file exists
+    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
+    if os.path.exists(schema_file_path):
+        with open(schema_file_path, "r") as f:
+            schema = json.load(f)
+    else:
+        # Generate schema using LLM (one-time setup)
+        schema = JsonCssExtractionStrategy.generate_schema(
+            html=sample_html,
+            llm_config=LLMConfig(
+                provider="groq/qwen-2.5-32b",
+                api_token="env:GROQ_API_KEY",
+            ),
+            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
+        )
+
+    print(f"Generated schema: {json.dumps(schema, indent=2)}")
+    # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once
+    with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
+        json.dump(schema, f, indent=2)
+
+    # Create no-LLM extraction strategy with the generated schema
+    extraction_strategy = JsonCssExtractionStrategy(schema)
+    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+    # Use the fast CSS extraction (no LLM calls during extraction)
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            "https://thehackernews.com", config=config
+        )
+
+        for result in results:
+            print(f"URL: {result.url}")
+            print(f"Success: {result.success}")
+            if result.success:
+                data = json.loads(result.extracted_content)
+                print(json.dumps(data, indent=2))
+            else:
+                print("Failed to extract structured data")
+
+async def demo_deep_crawl():
+    """Deep crawling with BFS strategy"""
+    print("\n=== 6. Deep Crawling ===")
+
+    filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
+
+    deep_crawl_strategy = BFSDeepCrawlStrategy(
+        max_depth=1, max_pages=5, filter_chain=filter_chain
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            url="https://docs.crawl4ai.com",
+            config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
+        )
+
+        print(f"Deep crawl returned {len(results)} pages:")
+        for i, result in enumerate(results):
+            depth = result.metadata.get("depth", "unknown")
+            print(f"  {i + 1}. {result.url} (Depth: {depth})")
+
+async def demo_js_interaction():
+    """Execute JavaScript to load more content"""
+    print("\n=== 7. JavaScript Interaction ===")
+
+    # A simple page that needs JS to reveal content
+    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
+        # Initial load
+
+        news_schema = {
+            "name": "news",
+            "baseSelector": "tr.athing",
+            "fields": [
+                {
+                    "name": "title",
+                    "selector": "span.titleline",
+                    "type": "text",
+                }
+            ],
+        }
+        results: List[CrawlResult] = await crawler.arun(
+            url="https://news.ycombinator.com",
+            config=CrawlerRunConfig(
+                session_id="hn_session",  # Keep session
+                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
+            ),
+        )
+
+        news = []
+        for result in results:
+            if result.success:
+                data = json.loads(result.extracted_content)
+                news.extend(data)
+                print(json.dumps(data, indent=2))
+            else:
+                print("Failed to extract structured data")
+
+        print(f"Initial items: {len(news)}")
+
+        # Click "More" link
+        more_config = CrawlerRunConfig(
+            js_code="document.querySelector('a.morelink').click();",
+            js_only=True,  # Continue in same page
+            session_id="hn_session",  # Keep session
+            extraction_strategy=JsonCssExtractionStrategy(
+                schema=news_schema,
+            ),
+        )
+
+        result: List[CrawlResult] = await crawler.arun(
+            url="https://news.ycombinator.com", config=more_config
+        )
+
+        # Extract new items
+        for result in results:
+            if result.success:
+                data = json.loads(result.extracted_content)
+                news.extend(data)
+                print(json.dumps(data, indent=2))
+            else:
+                print("Failed to extract structured data")
+        print(f"Total items: {len(news)}")
+
+async def demo_media_and_links():
+    """Extract media and links from a page"""
+    print("\n=== 8. Media and Links Extraction ===")
+
+    async with AsyncWebCrawler() as crawler:
+        result: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")
+
+        for i, result in enumerate(result):
+            # Extract and save all images
+            images = result.media.get("images", [])
+            print(f"Found {len(images)} images")
+
+            # Extract and save all links (internal and external)
+            internal_links = result.links.get("internal", [])
+            external_links = result.links.get("external", [])
+            print(f"Found {len(internal_links)} internal links")
+            print(f"Found {len(external_links)} external links")
+
+            # Print some of the images and links
+            for image in images[:3]:
+                print(f"Image: {image['src']}")
+            for link in internal_links[:3]:
+                print(f"Internal link: {link['href']}")
+            for link in external_links[:3]:
+                print(f"External link: {link['href']}")
+
+            # # Save everything to files
+            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
+                json.dump(images, f, indent=2)
+
+            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
+                json.dump(
+                    {"internal": internal_links, "external": external_links},
+                    f,
+                    indent=2,
+                )
+
+async def demo_screenshot_and_pdf():
+    """Capture screenshot and PDF of a page.
+
+    Crawls a Wikipedia article with screenshot and PDF capture enabled and,
+    for each returned result, writes ``tmp/example_screenshot.png`` and
+    ``tmp/example.pdf`` under ``__cur_dir__`` when the data is present.
+    """
+    print("\n=== 9. Screenshot and PDF Capture ===")
+
+    # Create the output directory up front so the writes below cannot fail
+    # on a missing path.
+    output_dir = f"{__cur_dir__}/tmp"
+    os.makedirs(output_dir, exist_ok=True)
+
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Giant_anteater",
+            config=CrawlerRunConfig(screenshot=True, pdf=True),
+        )
+
+        # A distinct loop variable keeps the result collection from being shadowed.
+        for result in results:
+            if result.screenshot:
+                # The screenshot is decoded from base64 before the bytes are written.
+                screenshot_path = f"{output_dir}/example_screenshot.png"
+                with open(screenshot_path, "wb") as f:
+                    f.write(base64.b64decode(result.screenshot))
+                print(f"Screenshot saved to {screenshot_path}")
+
+            if result.pdf:
+                # The PDF payload is written to disk as-is.
+                pdf_path = f"{output_dir}/example.pdf"
+                with open(pdf_path, "wb") as f:
+                    f.write(result.pdf)
+                print(f"PDF saved to {pdf_path}")
+
+async def demo_proxy_rotation():
+    """Show how a round-robin proxy strategy is attached to a run config.
+
+    No request is issued: the config is only constructed for illustration.
+    """
+    print("\n=== 10. Proxy Rotation ===")
+
+    # Placeholder endpoints; substitute working proxies before testing.
+    first_proxy = ProxyConfig(server="http://proxy1.example.com:8080")
+    second_proxy = ProxyConfig(server="http://proxy2.example.com:8080")
+    proxies = [first_proxy, second_proxy]
+
+    rotation = RoundRobinProxyStrategy(proxies)
+
+    proxy_count = len(proxies)
+    print(f"Using {proxy_count} proxies in rotation")
+    print("Note: This example uses placeholder proxies - replace with real ones to test")
+
+    async with AsyncWebCrawler() as crawler:
+        # Built but deliberately not passed to any crawl call in this demo.
+        config = CrawlerRunConfig(proxy_rotation_strategy=rotation)
+
+        print("In a real scenario, requests would rotate through the available proxies")
+
+async def demo_raw_html_and_file():
+    """Process raw HTML and local files.
+
+    Writes a small HTML document to ``docs/examples/tmp/sample.html``
+    (resolved against the current working directory), crawls it once through
+    a ``raw:`` URL and once through a ``file://`` URL, prints the first 50
+    characters of each markdown result, and removes the temporary file.
+    """
+    print("\n=== 11. Raw HTML and Local Files ===")
+
+    raw_html = """
+    <html><body>
+        <h1>Sample Article</h1>
+        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
+    </body></html>
+    """
+
+    # Save to file; create the target directory first so the write does not
+    # depend on the directory already existing under the working directory.
+    file_path = Path("docs/examples/tmp/sample.html").absolute()
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(file_path, "w") as f:
+        f.write(raw_html)
+
+    try:
+        async with AsyncWebCrawler() as crawler:
+            # Crawl raw HTML
+            raw_result = await crawler.arun(
+                url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            print("Raw HTML processing:")
+            print(f"  Markdown: {raw_result.markdown.raw_markdown[:50]}...")
+
+            # Crawl local file
+            file_result = await crawler.arun(
+                url=f"file://{file_path}",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+            )
+            print("\nLocal file processing:")
+            print(f"  Markdown: {file_result.markdown.raw_markdown[:50]}...")
+    finally:
+        # Clean up even when one of the crawls raised.
+        os.remove(file_path)
+    print(f"Processed both raw HTML and local file ({file_path})")
+
+async def main():
+    """Run every enabled demo, one after another."""
+    print("=== Comprehensive Crawl4AI Demo ===")
+    print("Note: Some examples require API keys or other configurations")
+
+    # Demos execute in the order listed; demo_proxy_rotation is disabled.
+    enabled_demos = (
+        demo_basic_crawl,
+        demo_parallel_crawl,
+        demo_fit_markdown,
+        demo_llm_structured_extraction_no_schema,
+        demo_css_structured_extraction_no_schema,
+        demo_deep_crawl,
+        demo_js_interaction,
+        demo_media_and_links,
+        demo_screenshot_and_pdf,
+        demo_raw_html_and_file,
+    )
+    for run_demo in enabled_demos:
+        await run_demo()
+
+    # Closing banner; nothing is deleted here.
+    print("\n=== Demo Complete ===")
+    print("Check for any generated files (screenshots, PDFs) in the current directory")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+```
+
+
+
+
+## File: docs/examples/dispatcher_example.py
+
+```py
+import asyncio
+import time
+from rich import print
+from rich.table import Table
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    MemoryAdaptiveDispatcher,
+    SemaphoreDispatcher,
+    RateLimiter,
+    CrawlerMonitor,
+    DisplayMode,
+    CacheMode,
+    LXMLWebScrapingStrategy,
+)
+
+
+async def memory_adaptive(urls, browser_config, run_config):
+    """Crawl ``urls`` through a monitored MemoryAdaptiveDispatcher.
+
+    Returns a ``(result_count, elapsed_seconds)`` tuple; the timing includes
+    entering and leaving the crawler context.
+    """
+    t0 = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        monitor = CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED)
+        dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=70.0,
+            max_session_permit=10,
+            monitor=monitor,
+        )
+        crawl_results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    elapsed = time.perf_counter() - t0
+    return len(crawl_results), elapsed
+
+
+async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
+    """Crawl ``urls`` through a monitored, rate-limited MemoryAdaptiveDispatcher.
+
+    Returns a ``(result_count, elapsed_seconds)`` tuple; the timing includes
+    entering and leaving the crawler context.
+    """
+    t0 = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        limiter = RateLimiter(base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2)
+        monitor = CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED)
+        dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=95.0,
+            max_session_permit=10,
+            rate_limiter=limiter,
+            monitor=monitor,
+        )
+        crawl_results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    elapsed = time.perf_counter() - t0
+    return len(crawl_results), elapsed
+
+
+async def semaphore(urls, browser_config, run_config):
+    """Crawl ``urls`` through a monitored SemaphoreDispatcher.
+
+    Returns a ``(result_count, elapsed_seconds)`` tuple; the timing includes
+    entering and leaving the crawler context.
+    """
+    t0 = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        monitor = CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED)
+        dispatcher = SemaphoreDispatcher(semaphore_count=5, monitor=monitor)
+        crawl_results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    elapsed = time.perf_counter() - t0
+    return len(crawl_results), elapsed
+
+
+async def semaphore_with_rate_limit(urls, browser_config, run_config):
+    """Crawl ``urls`` through a monitored, rate-limited SemaphoreDispatcher.
+
+    Returns a ``(result_count, elapsed_seconds)`` tuple; the timing includes
+    entering and leaving the crawler context.
+    """
+    t0 = time.perf_counter()
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        limiter = RateLimiter(base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2)
+        monitor = CrawlerMonitor(max_visible_rows=15, display_mode=DisplayMode.DETAILED)
+        dispatcher = SemaphoreDispatcher(
+            semaphore_count=5,
+            rate_limiter=limiter,
+            monitor=monitor,
+        )
+        crawl_results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
+    elapsed = time.perf_counter() - t0
+    return len(crawl_results), elapsed
+
+
+def create_performance_table(results):
+    """Creates a rich table showing performance results.
+
+    Args:
+        results: Mapping of strategy name to a ``(urls_crawled, duration)``
+            pair, with ``duration`` in seconds.
+
+    Returns:
+        A ``Table`` with one row per strategy, ordered by ascending duration.
+    """
+    table = Table(title="Crawler Strategy Performance Comparison")
+    table.add_column("Strategy", style="cyan")
+    table.add_column("URLs Crawled", justify="right", style="green")
+    table.add_column("Time (seconds)", justify="right", style="yellow")
+    table.add_column("URLs/second", justify="right", style="magenta")
+
+    # Fastest strategy first.
+    sorted_results = sorted(results.items(), key=lambda x: x[1][1])
+
+    for strategy, (urls_crawled, duration) in sorted_results:
+        # A zero duration has no meaningful rate; report 0 instead of
+        # raising ZeroDivisionError.
+        urls_per_second = urls_crawled / duration if duration else 0.0
+        table.add_row(
+            strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
+        )
+
+    return table
+
+
+async def main():
+    """Benchmark the enabled dispatcher strategies and print a summary table."""
+    # Synthetic targets: page1 through page39.
+    urls = ["https://example.com/page" + str(page) for page in range(1, 40)]
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy(),
+    )
+
+    results = {}
+    results["Memory Adaptive"] = await memory_adaptive(urls, browser_config, run_config)
+    # The remaining strategies are disabled; uncomment to include them.
+    # results["Memory Adaptive + Rate Limit"] = await memory_adaptive_with_rate_limit(
+    #     urls, browser_config, run_config
+    # )
+    # results["Semaphore"] = await semaphore(urls, browser_config, run_config)
+    # results["Semaphore + Rate Limit"] = await semaphore_with_rate_limit(
+    #     urls, browser_config, run_config
+    # )
+
+    summary = create_performance_table(results)
+    print("\nPerformance Summary:")
+    print(summary)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+```
+
+
+## File: docs/examples/hello_world.py
+
+```py
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter,
+    CrawlResult
+)
+
+async def example_cdp():
+    """Run a JS-only crawl against a browser configured with a CDP URL.
+
+    Prints the value the injected script returned.
+    """
+    cdp_browser = BrowserConfig(headless=False, cdp_url="http://localhost:9223")
+    js_run = CrawlerRunConfig(
+        session_id="test",
+        js_code="""(() => { return {"result": "Hello World!"} })()""",
+        js_only=True,
+    )
+    async with AsyncWebCrawler(config=cdp_browser, verbose=True) as crawler:
+        outcome: CrawlResult = await crawler.arun(
+            url="https://www.helloworld.org",
+            config=js_run,
+        )
+        print(outcome.js_execution_result)
+                   
+
+async def main():
+    """Crawl helloworld.org with a pruning content filter.
+
+    Prints the first 500 characters of the raw markdown.
+    """
+    headless_browser = BrowserConfig(headless=True, verbose=True)
+    async with AsyncWebCrawler(config=headless_browser) as crawler:
+        pruning_filter = PruningContentFilter(
+            threshold=0.48, threshold_type="fixed", min_word_threshold=0
+        )
+        run_settings = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
+        )
+        page: CrawlResult = await crawler.arun(
+            url="https://www.helloworld.org", config=run_settings
+        )
+        preview = page.markdown.raw_markdown[:500]
+        print(preview)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+```
+
+
+## File: docs/examples/hooks_example.py
+
+```py
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
+
+
+async def main():
+    """Crawl example.com with every crawler-strategy hook registered.
+
+    Each hook prints a marker when it fires; some also touch the page or
+    context (cookies, viewport, headers, scrolling). The crawler is started
+    explicitly and is always closed, even when the crawl raises.
+    """
+    print("🔗 Hooks Example: Demonstrating different hook use cases")
+
+    # Configure browser settings
+    browser_config = BrowserConfig(headless=True)
+
+    # Configure crawler settings
+    crawler_run_config = CrawlerRunConfig(
+        js_code="window.scrollTo(0, document.body.scrollHeight);",
+        wait_for="body",
+        cache_mode=CacheMode.BYPASS,
+    )
+
+    # Create crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+
+    # Define and set hook functions
+    async def on_browser_created(browser, context: BrowserContext, **kwargs):
+        """Hook called after the browser is created"""
+        print("[HOOK] on_browser_created - Browser is ready!")
+        # Nothing is configured here; this hook only reports that it fired.
+        return browser
+
+    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
+        """Hook called after a new page and context are created"""
+        print("[HOOK] on_page_context_created - New page created!")
+        # Example: Set a cookie on the context
+        await context.add_cookies(
+            [
+                {
+                    "name": "session_id",
+                    "value": "example_session",
+                    "domain": ".example.com",
+                    "path": "/",
+                }
+            ]
+        )
+        # Example: Set default viewport size
+        await page.set_viewport_size({"width": 1080, "height": 800})
+        return page
+
+    async def on_user_agent_updated(
+        page: Page, context: BrowserContext, user_agent: str, **kwargs
+    ):
+        """Hook called when the user agent is updated"""
+        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
+        return page
+
+    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+        """Hook called after custom JavaScript execution"""
+        print("[HOOK] on_execution_started - Custom JS executed!")
+        return page
+
+    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
+        """Hook called before navigating to each URL"""
+        print(f"[HOOK] before_goto - About to visit: {url}")
+        # Example: Add custom headers for the request
+        await page.set_extra_http_headers({"Custom-Header": "my-value"})
+        return page
+
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
+        """Hook called after navigating to each URL"""
+        print(f"[HOOK] after_goto - Successfully loaded: {url}")
+        # Example: Wait for a specific element to be loaded
+        try:
+            await page.wait_for_selector(".content", timeout=1000)
+            print("Content element found!")
+        except Exception:
+            # Best effort only. Catching Exception rather than using a bare
+            # except lets task cancellation propagate.
+            print("Content element not found, continuing anyway")
+        return page
+
+    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
+        """Hook called before retrieving the HTML content"""
+        print("[HOOK] before_retrieve_html - About to get HTML content")
+        # Example: Scroll to bottom to trigger lazy loading
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+        return page
+
+    async def before_return_html(
+        page: Page, context: BrowserContext, html: str, **kwargs
+    ):
+        """Hook called before returning the HTML content"""
+        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
+        # Example: You could modify the HTML content here if needed
+        return page
+
+    # Set all the hooks
+    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+    crawler.crawler_strategy.set_hook(
+        "on_page_context_created", on_page_context_created
+    )
+    crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
+    crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+    crawler.crawler_strategy.set_hook("before_goto", before_goto)
+    crawler.crawler_strategy.set_hook("after_goto", after_goto)
+    crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)
+    crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
+
+    await crawler.start()
+
+    try:
+        # Example usage: crawl a simple website
+        url = "https://example.com"
+        result = await crawler.arun(url, config=crawler_run_config)
+        print(f"\nCrawled URL: {result.url}")
+        print(f"HTML length: {len(result.html)}")
+    finally:
+        # Close the crawler even when the crawl fails.
+        await crawler.close()
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
+
+```
+
+
+
+## File: crawl4ai/deep_crawling/__init__.py
+
+```py
+# deep_crawling/__init__.py
+from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
+from .bfs_strategy import BFSDeepCrawlStrategy
+from .bff_strategy import BestFirstCrawlingStrategy
+from .dfs_strategy import DFSDeepCrawlStrategy
+from .filters import (
+    FilterChain,
+    ContentTypeFilter,
+    DomainFilter,
+    URLFilter,
+    URLPatternFilter,
+    FilterStats,
+    ContentRelevanceFilter,
+    SEOFilter
+)
+from .scorers import (
+    KeywordRelevanceScorer,
+    URLScorer,
+    CompositeScorer,
+    DomainAuthorityScorer,
+    FreshnessScorer,
+    PathDepthScorer,
+    ContentTypeScorer
+)
+
+__all__ = [
+    "DeepCrawlDecorator",
+    "DeepCrawlStrategy",
+    "BFSDeepCrawlStrategy",
+    "BestFirstCrawlingStrategy",
+    "DFSDeepCrawlStrategy",
+    "FilterChain",
+    "ContentTypeFilter",
+    "DomainFilter",
+    "URLFilter",
+    "URLPatternFilter",
+    "FilterStats",
+    "ContentRelevanceFilter",
+    "SEOFilter",
+    "KeywordRelevanceScorer",
+    "URLScorer",
+    "CompositeScorer",
+    "DomainAuthorityScorer",
+    "FreshnessScorer",
+    "PathDepthScorer",
+    "ContentTypeScorer",
+]
+
+```
+
+
+## File: crawl4ai/deep_crawling/base_strategy.py
+
+```py
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Optional, Set, List, Dict
+from functools import wraps
+from contextvars import ContextVar
+from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
+
+
+class DeepCrawlDecorator:
+    """Decorator that adds deep crawling capability to arun method.
+
+    When the run config carries a ``deep_crawl_strategy`` and no deep crawl is
+    already active in the current context, the call is delegated to that
+    strategy; otherwise the original ``arun`` is invoked unchanged.
+    """
+    # Context-local flag, True while a deep crawl is in progress, so that the
+    # arun calls issued by the strategy itself are not delegated again.
+    deep_crawl_active = ContextVar("deep_crawl_active", default=False)
+
+    def __init__(self, crawler: AsyncWebCrawler):
+        self.crawler = crawler
+
+    def __call__(self, original_arun):
+        """Wrap ``original_arun`` and return the deep-crawl-aware coroutine function."""
+        @wraps(original_arun)
+        async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
+            # If deep crawling is already active (or not requested), call the
+            # original method to avoid recursion.
+            if not (config and config.deep_crawl_strategy and not self.deep_crawl_active.get()):
+                return await original_arun(url, config=config, **kwargs)
+
+            token = self.deep_crawl_active.set(True)
+            try:
+                # Await the arun call to get the actual result object.
+                result_obj = await config.deep_crawl_strategy.arun(
+                    crawler=self.crawler,
+                    start_url=url,
+                    config=config
+                )
+            except BaseException:
+                # Without this reset a failed crawl would leave the flag set
+                # and every later deep crawl in this context would be skipped.
+                self.deep_crawl_active.reset(token)
+                raise
+
+            if config.stream:
+                async def result_wrapper():
+                    # Keep the flag set while results are being consumed and
+                    # clear it once the stream ends or is closed.
+                    try:
+                        async for result in result_obj:
+                            yield result
+                    finally:
+                        self.deep_crawl_active.reset(token)
+                return result_wrapper()
+
+            # Batch mode: the crawl has already finished, so clear the flag now.
+            self.deep_crawl_active.reset(token)
+            return result_obj
+        return wrapped_arun
+
+class DeepCrawlStrategy(ABC):
+    """
+    Abstract base class for deep crawling strategies.
+
+    Core functions:
+      - arun: Main entry point; returns a list of CrawlResults in batch mode
+        or an async generator of them in stream mode.
+      - shutdown: Clean up resources.
+      - can_process_url: Validate a URL and decide whether to process it.
+      - link_discovery: Extract and process links from a CrawlResult.
+    """
+
+    @abstractmethod
+    async def _arun_batch(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> List[CrawlResult]:
+        """
+        Batch (non-streaming) mode:
+        Runs the crawl and returns all the results as a list.
+        """
+        pass
+
+    @abstractmethod
+    async def _arun_stream(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> AsyncGenerator[CrawlResult, None]:
+        """
+        Streaming mode:
+        Yields results as they arrive instead of collecting them first.
+        """
+        pass
+
+    async def arun(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: Optional[CrawlerRunConfig] = None,
+    ) -> RunManyReturn:
+        """
+        Traverse the given URL using the specified crawler.
+
+        Args:
+            start_url (str): The URL from which to start crawling.
+            crawler (AsyncWebCrawler): The crawler instance to use.
+            config (Optional[CrawlerRunConfig]): Crawler configuration; required
+                despite the None default.
+
+        Returns:
+            The async generator from _arun_stream when config.stream is set,
+            otherwise the awaited list from _arun_batch.
+
+        Raises:
+            ValueError: If config is None.
+        """
+        if config is None:
+            raise ValueError("CrawlerRunConfig must be provided")
+
+        # Stream mode hands back the generator un-iterated; batch mode is awaited here.
+        if config.stream:
+            return self._arun_stream(start_url, crawler, config)
+        else:
+            return await self._arun_batch(start_url, crawler, config)
+
+    def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig):
+        """Shorthand for arun; returns the arun coroutine, which the caller must await."""
+        return self.arun(start_url, crawler, config)
+
+    @abstractmethod
+    async def shutdown(self) -> None:
+        """
+        Clean up resources used by the deep crawl strategy.
+        """
+        pass
+
+    @abstractmethod
+    async def can_process_url(self, url: str, depth: int) -> bool:
+        """
+        Validate the URL format and apply custom filtering logic.
+
+        Args:
+            url (str): The URL to validate.
+            depth (int): The current depth in the crawl.
+
+        Returns:
+            bool: True if the URL should be processed, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        visited: Set[str],
+        next_level: List[tuple],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        Extract and process links from the given crawl result.
+
+        This method should:
+          - Validate each extracted URL using can_process_url.
+          - Optionally score URLs.
+          - Append valid URLs (and their parent references) to the next_level list.
+          - Update the depths dictionary with the new depth for each URL.
+
+        Args:
+            result (CrawlResult): The result from a crawl operation.
+            source_url (str): The URL from which this result was obtained.
+            current_depth (int): The depth at which the source URL was processed.
+            visited (Set[str]): Set of already visited URLs.
+            next_level (List[tuple]): List of tuples (url, parent_url) to crawl next;
+                filled in place.
+            depths (Dict[str, int]): Mapping of URLs to their current depth;
+                updated in place.
+        """
+        pass
+
+
+```
+
+
+## File: crawl4ai/deep_crawling/bff_strategy.py
+
+```py
+# best_first_crawling_strategy.py
+import asyncio
+import logging
+from datetime import datetime
+from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
+from urllib.parse import urlparse
+
+from ..models import TraversalStats
+from .filters import FilterChain
+from .scorers import URLScorer
+from . import DeepCrawlStrategy
+
+from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
+
+from math import inf as infinity
+
+# Configurable batch size for processing items from the priority queue
+BATCH_SIZE = 10
+
+
+class BestFirstCrawlingStrategy(DeepCrawlStrategy):
+    """
+    Best-First Crawling Strategy using a priority queue.
+    
+    This strategy prioritizes URLs based on their score, ensuring that higher-value
+    pages are crawled first. It reimplements the core traversal loop to use a priority
+    queue while keeping URL validation and link discovery consistent with our design.
+    
+    Core methods:
+      - arun: Returns either a list (batch mode) or an async generator (stream mode).
+      - _arun_best_first: Core generator that uses a priority queue to yield CrawlResults.
+      - can_process_url: Validates URLs and applies filtering (inherited behavior).
+      - link_discovery: Extracts and validates links from a CrawlResult.
+    """
+    def __init__(
+        self,
+        max_depth: int,
+        filter_chain: FilterChain = FilterChain(),
+        url_scorer: Optional[URLScorer] = None,
+        include_external: bool = False,
+        max_pages: int = infinity,
+        logger: Optional[logging.Logger] = None,
+    ):
+        self.max_depth = max_depth
+        self.filter_chain = filter_chain
+        self.url_scorer = url_scorer
+        self.include_external = include_external
+        self.max_pages = max_pages
+        self.logger = logger or logging.getLogger(__name__)
+        self.stats = TraversalStats(start_time=datetime.now())
+        self._cancel_event = asyncio.Event()
+        self._pages_crawled = 0
+
+    async def can_process_url(self, url: str, depth: int) -> bool:
+        """
+        Validate the URL format and apply filtering.
+        For the starting URL (depth 0), filtering is bypassed.
+        """
+        try:
+            parsed = urlparse(url)
+            if not parsed.scheme or not parsed.netloc:
+                raise ValueError("Missing scheme or netloc")
+            if parsed.scheme not in ("http", "https"):
+                raise ValueError("Invalid scheme")
+            if "." not in parsed.netloc:
+                raise ValueError("Invalid domain")
+        except Exception as e:
+            self.logger.warning(f"Invalid URL: {url}, error: {e}")
+            return False
+
+        if depth != 0 and not await self.filter_chain.apply(url):
+            return False
+
+        return True
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        visited: Set[str],
+        next_links: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        Extract links from the crawl result, validate them, and append new URLs
+        (with their parent references) to next_links.
+        Also updates the depths dictionary.
+        """
+        new_depth = current_depth + 1
+        if new_depth > self.max_depth:
+            return
+            
+        # If we've reached the max pages limit, don't discover new links
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return
+
+        # Retrieve internal links; include external links if enabled.
+        links = result.links.get("internal", [])
+        if self.include_external:
+            links += result.links.get("external", [])
+
+        # If we have more links than remaining capacity, limit how many we'll process
+        valid_links = []
+        for link in links:
+            url = link.get("href")
+            if url in visited:
+                continue
+            if not await self.can_process_url(url, new_depth):
+                self.stats.urls_skipped += 1
+                continue
+                
+            valid_links.append(url)
+            
+        # If we have more valid links than capacity, limit them
+        if len(valid_links) > remaining_capacity:
+            valid_links = valid_links[:remaining_capacity]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+            
+        # Record the new depths and add to next_links
+        for url in valid_links:
+            depths[url] = new_depth
+            next_links.append((url, source_url))
+
+    async def _arun_best_first(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> AsyncGenerator[CrawlResult, None]:
+        """
+        Core best-first crawl method using a priority queue.
+        
+        The queue items are tuples of (score, depth, url, parent_url). Lower scores
+        are treated as higher priority. URLs are processed in batches for efficiency.
+        """
+        queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
+        # Push the initial URL with score 0 and depth 0.
+        await queue.put((0, 0, start_url, None))
+        visited: Set[str] = set()
+        depths: Dict[str, int] = {start_url: 0}
+
+        while not queue.empty() and not self._cancel_event.is_set():
+            # Stop if we've reached the max pages limit
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+                
+            batch: List[Tuple[float, int, str, Optional[str]]] = []
+            # Retrieve up to BATCH_SIZE items from the priority queue.
+            for _ in range(BATCH_SIZE):
+                if queue.empty():
+                    break
+                item = await queue.get()
+                score, depth, url, parent_url = item
+                if url in visited:
+                    continue
+                visited.add(url)
+                batch.append(item)
+
+            if not batch:
+                continue
+
+            # Process the current batch of URLs.
+            urls = [item[2] for item in batch]
+            batch_config = config.clone(deep_crawl_strategy=None, stream=True)
+            stream_gen = await crawler.arun_many(urls=urls, config=batch_config)
+            async for result in stream_gen:
+                result_url = result.url
+                # Find the corresponding tuple from the batch.
+                corresponding = next((item for item in batch if item[2] == result_url), None)
+                if not corresponding:
+                    continue
+                score, depth, url, parent_url = corresponding
+                result.metadata = result.metadata or {}
+                result.metadata["depth"] = depth
+                result.metadata["parent_url"] = parent_url
+                result.metadata["score"] = score
+                
+                # Count only successful crawls toward max_pages limit
+                if result.success:
+                    self._pages_crawled += 1
+                
+                yield result
+                
+                # Only discover links from successful crawls
+                if result.success:
+                    # Discover new links from this result
+                    new_links: List[Tuple[str, Optional[str]]] = []
+                    await self.link_discovery(result, result_url, depth, visited, new_links, depths)
+                    
+                    for new_url, new_parent in new_links:
+                        new_depth = depths.get(new_url, depth + 1)
+                        new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
+                        await queue.put((new_score, new_depth, new_url, new_parent))
+
+        # End of crawl.
+
+    async def _arun_batch(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> List[CrawlResult]:
+        """
+        Best-first crawl in batch mode.
+        
+        Aggregates all CrawlResults into a list.
+        """
+        results: List[CrawlResult] = []
+        async for result in self._arun_best_first(start_url, crawler, config):
+            results.append(result)
+        return results
+
+    async def _arun_stream(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> AsyncGenerator[CrawlResult, None]:
+        """
+        Best-first crawl in streaming mode.
+        
+        Yields CrawlResults as they become available.
+        """
+        async for result in self._arun_best_first(start_url, crawler, config):
+            yield result
+
+    async def arun(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: Optional[CrawlerRunConfig] = None,
+    ) -> "RunManyReturn":
+        """
+        Main entry point for best-first crawling.
+        
+        Returns either a list (batch mode) or an async generator (stream mode)
+        of CrawlResults.
+        """
+        if config is None:
+            raise ValueError("CrawlerRunConfig must be provided")
+        if config.stream:
+            return self._arun_stream(start_url, crawler, config)
+        else:
+            return await self._arun_batch(start_url, crawler, config)
+
+    async def shutdown(self) -> None:
+        """
+        Request cancellation of the crawl and stamp its end time.
+
+        Sets the internal cancel event and records ``datetime.now()`` as
+        ``stats.end_time``.
+        """
+        cancel_event = self._cancel_event
+        cancel_event.set()
+        self.stats.end_time = datetime.now()
+
+```
+
+
+## File: crawl4ai/deep_crawling/bfs_strategy.py
+
+```py
+# bfs_deep_crawl_strategy.py
+import asyncio
+import logging
+from datetime import datetime
+from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
+from urllib.parse import urlparse
+
+from ..models import TraversalStats
+from .filters import FilterChain
+from .scorers import URLScorer
+from . import DeepCrawlStrategy  
+from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
+from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
+from math import inf as infinity
+
+class BFSDeepCrawlStrategy(DeepCrawlStrategy):
+    """
+    Breadth-First Search deep crawling strategy.
+
+    The crawl advances one depth level at a time: all URLs of the current
+    level are fetched through ``crawler.arun_many`` and the links found on
+    the successful pages become the next level.
+
+    Core functions:
+      - arun: Main entry point; not defined in this class (presumably
+        inherited from DeepCrawlStrategy - confirm against the base class).
+      - link_discovery: Extracts, filters, and (if needed) scores the outgoing URLs.
+      - can_process_url: Validates URL format and applies the filter chain.
+    """
+
+    def __init__(
+        self,
+        max_depth: int,
+        filter_chain: Optional[FilterChain] = None,
+        url_scorer: Optional[URLScorer] = None,
+        include_external: bool = False,
+        score_threshold: float = -infinity,
+        max_pages: int = infinity,
+        logger: Optional[logging.Logger] = None,
+    ):
+        """
+        Args:
+            max_depth: Maximum depth of pages to crawl; the start URL is depth 0.
+            filter_chain: Filters applied to every discovered URL. A new,
+                empty FilterChain is created when omitted.
+            url_scorer: Optional scorer; without one every URL scores 0.
+            include_external: Also follow the links listed under "external".
+            score_threshold: URLs scoring below this value are skipped.
+            max_pages: Budget of successfully crawled pages (infinite by default).
+            logger: Logger to use; defaults to this module's logger.
+        """
+        self.max_depth = max_depth
+        # Build the default chain per instance: a FilterChain() default in the
+        # signature is evaluated once and would be shared, together with its
+        # statistics, by every strategy that relies on the default.
+        self.filter_chain = filter_chain if filter_chain is not None else FilterChain()
+        self.url_scorer = url_scorer
+        self.include_external = include_external
+        self.score_threshold = score_threshold
+        self.max_pages = max_pages
+        self.logger = logger or logging.getLogger(__name__)
+        self.stats = TraversalStats(start_time=datetime.now())
+        self._cancel_event = asyncio.Event()
+        self._pages_crawled = 0
+
+    async def can_process_url(self, url: str, depth: int) -> bool:
+        """
+        Validates the URL and applies the filter chain.
+        For the start URL (depth 0) filtering is bypassed.
+
+        A URL is valid when it parses with an http/https scheme and a netloc
+        that contains a dot; anything else is logged and rejected.
+
+        Returns:
+            True when the URL may be crawled, False otherwise.
+        """
+        try:
+            parsed = urlparse(url)
+            if not parsed.scheme or not parsed.netloc:
+                raise ValueError("Missing scheme or netloc")
+            if parsed.scheme not in ("http", "https"):
+                raise ValueError("Invalid scheme")
+            if "." not in parsed.netloc:
+                raise ValueError("Invalid domain")
+        except Exception as e:
+            self.logger.warning(f"Invalid URL: {url}, error: {e}")
+            return False
+
+        if depth != 0 and not await self.filter_chain.apply(url):
+            return False
+
+        return True
+
+    def _limit_to_page_budget(
+        self, level: List[Tuple[str, Optional[str]]]
+    ) -> List[Tuple[str, Optional[str]]]:
+        """Trim a BFS level so that crawling it cannot exceed ``max_pages``."""
+        remaining = self.max_pages - self._pages_crawled
+        # With the default infinite budget this comparison is always true.
+        if len(level) <= remaining:
+            return level
+        allowed = max(int(remaining), 0)
+        self.logger.info(f"Limiting to {allowed} URLs due to max_pages limit")
+        return level[:allowed]
+
+    @staticmethod
+    def _parent_lookup(level: List[Tuple[str, Optional[str]]]) -> Dict[str, Optional[str]]:
+        """Map each URL of a level to the parent of its first occurrence."""
+        parents: Dict[str, Optional[str]] = {}
+        for url, parent in level:
+            parents.setdefault(url, parent)
+        return parents
+
+    async def link_discovery(
+        self,
+        result: CrawlResult,
+        source_url: str,
+        current_depth: int,
+        visited: Set[str],
+        next_level: List[Tuple[str, Optional[str]]],
+        depths: Dict[str, int],
+    ) -> None:
+        """
+        Extracts links from the crawl result, validates and scores them, and
+        prepares the next level of URLs.
+        Each valid URL is appended to next_level as a tuple (url, parent_url)
+        and its depth is tracked.
+
+        Nothing is added when the next depth would exceed ``max_depth`` or
+        when the page budget is already used up. URLs that were visited,
+        are already queued (present in ``depths``) or were already seen on
+        this page are ignored, so a URL is queued at most once.
+        """
+        next_depth = current_depth + 1
+        if next_depth > self.max_depth:
+            return
+
+        # If we've reached the max pages limit, don't discover new links
+        remaining_capacity = self.max_pages - self._pages_crawled
+        if remaining_capacity <= 0:
+            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
+            return
+
+        # Work on a copy: extending the list stored in result.links in place
+        # would leak the external links into the result's "internal" list.
+        links = list(result.links.get("internal", []))
+        if self.include_external:
+            links.extend(result.links.get("external", []))
+
+        valid_links = []
+        seen_on_page: Set[str] = set()
+
+        # First collect all valid links
+        for link in links:
+            url = link.get("href")
+            if not url:
+                # Nothing to normalize or crawl for a link without a target.
+                self.stats.urls_skipped += 1
+                continue
+            base_url = normalize_url_for_deep_crawl(url, source_url)
+            if base_url in visited or base_url in depths or base_url in seen_on_page:
+                continue
+            seen_on_page.add(base_url)
+            if not await self.can_process_url(url, next_depth):
+                self.stats.urls_skipped += 1
+                continue
+
+            # Score the URL if a scorer is provided
+            score = self.url_scorer.score(base_url) if self.url_scorer else 0
+
+            # Skip URLs with scores below the threshold
+            if score < self.score_threshold:
+                self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
+                self.stats.urls_skipped += 1
+                continue
+
+            valid_links.append((base_url, score))
+
+        # If we have more valid links than capacity, sort by score and take the top ones
+        if len(valid_links) > remaining_capacity:
+            if self.url_scorer:
+                # Sort by score in descending order
+                valid_links.sort(key=lambda x: x[1], reverse=True)
+            # remaining_capacity is finite here; int() keeps the slice valid
+            # even when max_pages was given as a float.
+            valid_links = valid_links[: int(remaining_capacity)]
+            self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
+
+        # Process the final selected links
+        for url, score in valid_links:
+            # NOTE(review): the score of each outgoing link is written onto the
+            # *source* page's metadata, so the last non-zero score wins. Kept
+            # as-is; confirm whether the score was meant for the child result.
+            if score:
+                result.metadata = result.metadata or {}
+                result.metadata["score"] = score
+            next_level.append((url, source_url))
+            depths[url] = next_depth
+
+    async def _arun_batch(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> List[CrawlResult]:
+        """
+        Batch (non-streaming) mode:
+        Processes one BFS level at a time and returns all results at the end.
+
+        Every result gets ``metadata["depth"]`` and ``metadata["parent_url"]``.
+        Only successful results count toward ``max_pages`` and have their
+        links followed.
+        """
+        visited: Set[str] = set()
+        # current_level holds tuples: (url, parent_url)
+        current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
+        depths: Dict[str, int] = {start_url: 0}
+
+        results: List[CrawlResult] = []
+
+        while current_level and not self._cancel_event.is_set():
+            # Each page of the previous level queued links against the budget
+            # that was left at that moment, so the level as a whole may be
+            # larger than what max_pages still allows.
+            current_level = self._limit_to_page_budget(current_level)
+            if not current_level:
+                break
+
+            next_level: List[Tuple[str, Optional[str]]] = []
+            parents = self._parent_lookup(current_level)
+            urls = [url for url, _ in current_level]
+            visited.update(urls)
+
+            # Clone the config to disable deep crawling recursion and enforce batch mode.
+            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
+            batch_results = await crawler.arun_many(urls=urls, config=batch_config)
+
+            # Update pages crawled counter - count only successful crawls
+            self._pages_crawled += sum(1 for r in batch_results if r.success)
+
+            for result in batch_results:
+                url = result.url
+                # A result whose URL is unknown here falls back to depth 0 / no parent.
+                depth = depths.get(url, 0)
+                result.metadata = result.metadata or {}
+                result.metadata["depth"] = depth
+                result.metadata["parent_url"] = parents.get(url)
+                results.append(result)
+
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)
+
+            current_level = next_level
+
+        return results
+
+    async def _arun_stream(
+        self,
+        start_url: str,
+        crawler: AsyncWebCrawler,
+        config: CrawlerRunConfig,
+    ) -> AsyncGenerator[CrawlResult, None]:
+        """
+        Streaming mode:
+        Processes one BFS level at a time and yields results immediately as they arrive.
+
+        Each result is annotated with ``metadata["depth"]`` and
+        ``metadata["parent_url"]`` before it is yielded; link discovery for a
+        successful page runs after the yield.
+        """
+        visited: Set[str] = set()
+        current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
+        depths: Dict[str, int] = {start_url: 0}
+
+        while current_level and not self._cancel_event.is_set():
+            # See _arun_batch: keep the level within the remaining page budget.
+            current_level = self._limit_to_page_budget(current_level)
+            if not current_level:
+                break
+
+            next_level: List[Tuple[str, Optional[str]]] = []
+            parents = self._parent_lookup(current_level)
+            urls = [url for url, _ in current_level]
+            visited.update(urls)
+
+            stream_config = config.clone(deep_crawl_strategy=None, stream=True)
+            stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
+
+            # Keep track of processed results for this batch
+            results_count = 0
+            async for result in stream_gen:
+                url = result.url
+                depth = depths.get(url, 0)
+                result.metadata = result.metadata or {}
+                result.metadata["depth"] = depth
+                result.metadata["parent_url"] = parents.get(url)
+
+                # Count only successful crawls
+                if result.success:
+                    self._pages_crawled += 1
+
+                results_count += 1
+                yield result
+
+                # Only discover links from successful crawls
+                if result.success:
+                    # Link discovery will handle the max pages limit internally
+                    await self.link_discovery(result, url, depth, visited, next_level, depths)
+
+            # The URLs were added to `visited` above, so a level that produced
+            # no results cannot be scheduled again; just report it.
+            if results_count == 0 and urls:
+                self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited")
+
+            current_level = next_level
+
+    async def shutdown(self) -> None:
+        """
+        Clean up resources and signal cancellation of the crawl.
+
+        Sets the cancel event checked at the top of each BFS level and
+        records the end time in ``stats``.
+        """
+        self._cancel_event.set()
+        self.stats.end_time = datetime.now()
+
+```
+
+
+## File: crawl4ai/deep_crawling/filters.py
+
+```py
+from abc import ABC, abstractmethod
+from typing import List, Pattern, Set, Union
+from urllib.parse import urlparse
+from array import array
+import re
+import logging
+from functools import lru_cache
+import fnmatch
+from dataclasses import dataclass
+import weakref
+import math
+from collections import defaultdict
+from typing import Dict
+from ..utils import HeadPeekr
+import asyncio
+import inspect
+
+
+@dataclass
+class FilterStats:
+    """
+    Three running counters for a filter: total, passed and rejected URLs.
+
+    The counters are kept in one unsigned-int ``array`` stored in the only
+    slot, ``_counters``; the read-only properties expose each position.
+    """
+
+    __slots__ = ("_counters",)
+
+    def __init__(self):
+        # Index 0 = total, 1 = passed, 2 = rejected; all start at zero.
+        self._counters = array("I", (0, 0, 0))
+
+    @property
+    def total_urls(self):
+        """Counter at index 0: total URLs."""
+        counters = self._counters
+        return counters[0]
+
+    @property
+    def passed_urls(self):
+        """Counter at index 1: passed URLs."""
+        counters = self._counters
+        return counters[1]
+
+    @property
+    def rejected_urls(self):
+        """Counter at index 2: rejected URLs."""
+        counters = self._counters
+        return counters[2]
+
+
+class URLFilter(ABC):
+    """
+    Abstract base class for URL filters.
+
+    Stores a display name, a FilterStats instance and a lazily created
+    logger named ``urlfilter.<name>``. Subclasses implement ``apply``.
+    """
+
+    __slots__ = ("name", "stats", "_logger_ref")
+
+    def __init__(self, name: str = None):
+        # Fall back to the concrete class name when no name is supplied.
+        self.name = name if name else self.__class__.__name__
+        self.stats = FilterStats()
+        # Filled in on first access of `logger`; holds a weak reference.
+        self._logger_ref = None
+
+    @property
+    def logger(self):
+        """Return the filter's logger, creating it on first use."""
+        ref = self._logger_ref
+        cached = ref() if ref is not None else None
+        if cached is None:
+            cached = logging.getLogger(f"urlfilter.{self.name}")
+            self._logger_ref = weakref.ref(cached)
+        return cached
+
+    @abstractmethod
+    def apply(self, url: str) -> bool:
+        """Decide whether ``url`` passes the filter; implemented by subclasses."""
+        pass
+
+    def _update_stats(self, passed: bool):
+        """Count one processed URL as passed or rejected."""
+        counters = self.stats._counters
+        counters[0] += 1  # total
+        # bool arithmetic: adds 1 to exactly one of the two counters
+        counters[1] += passed
+        counters[2] += not passed
+
+
+class FilterChain:
+    """
+    Ordered collection of URL filters that must all accept a URL.
+
+    Filters may be synchronous (``apply`` returns a bool) or asynchronous
+    (``apply`` returns an awaitable). ``stats`` counts each URL once as
+    total and once as either passed or rejected.
+    """
+
+    __slots__ = ("filters", "stats", "_logger_ref")
+
+    def __init__(self, filters: List["URLFilter"] = None):
+        self.filters = tuple(filters or [])  # Immutable tuple for speed
+        self.stats = FilterStats()
+        self._logger_ref = None
+
+    @property
+    def logger(self):
+        """Return the chain's logger (``urlfilter.chain``), creating it on first use."""
+        if self._logger_ref is None or self._logger_ref() is None:
+            logger = logging.getLogger("urlfilter.chain")
+            self._logger_ref = weakref.ref(logger)
+        return self._logger_ref()
+
+    def add_filter(self, filter_: "URLFilter") -> "FilterChain":
+        """
+        Add a filter to the end of the chain.
+
+        Returns:
+            The chain itself, so calls can be chained.
+        """
+        # `filters` is a tuple, which has no append(); build a new tuple.
+        self.filters = self.filters + (filter_,)
+        return self  # Enable method chaining
+
+    async def apply(self, url: str) -> bool:
+        """
+        Run every filter on ``url``.
+
+        Filters are called in order. A synchronous rejection returns False
+        immediately; awaitables returned by async filters are collected and
+        awaited together afterwards.
+
+        Returns:
+            True if all filters accepted the URL, False otherwise.
+        """
+        counters = self.stats._counters
+        counters[0] += 1  # Total processed URLs
+
+        pending = []
+        for f in self.filters:
+            outcome = f.apply(url)
+
+            if inspect.isawaitable(outcome):
+                pending.append(outcome)  # Awaited together below
+            elif not outcome:  # Sync rejection
+                # Coroutines collected so far will never be awaited; close
+                # them so Python does not warn about un-awaited coroutines.
+                for awaitable in pending:
+                    if inspect.iscoroutine(awaitable):
+                        awaitable.close()
+                counters[2] += 1
+                return False
+
+        if pending:
+            outcomes = await asyncio.gather(*pending)
+            if not all(outcomes):
+                # One rejection per URL (not per rejecting filter) keeps
+                # total == passed + rejected.
+                counters[2] += 1
+                return False
+
+        counters[1] += 1  # Passed
+        return True
+
+
+class URLPatternFilter(URLFilter):
+    """
+    Filter URLs by glob or regex patterns.
+
+    Each pattern is sorted into one of several matchers when the filter is
+    built: simple suffixes (``*.html``), simple prefixes (``/foo/*``),
+    domain regexes, and compiled path patterns (regexes and translated
+    globs). A URL matches when any matcher accepts it; with ``reverse=True``
+    matching URLs are rejected instead of accepted.
+    """
+
+    __slots__ = (
+        "_simple_suffixes",
+        "_simple_prefixes",
+        "_domain_patterns",
+        "_path_patterns",
+        "_reverse",
+    )
+
+    PATTERN_TYPES = {
+        "SUFFIX": 1,  # *.html
+        "PREFIX": 2,  # /foo/*
+        "DOMAIN": 3,  # *.example.com
+        "PATH": 4,  # Everything else
+        "REGEX": 5,
+    }
+
+    def __init__(
+        self,
+        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
+        use_glob: bool = True,
+        reverse: bool = False,
+    ):
+        """
+        Args:
+            patterns: One pattern or a list of patterns; strings or compiled
+                regular expressions.
+            use_glob: Accepted for compatibility; not read by this class.
+            reverse: When True, a matching URL is rejected and a
+                non-matching URL is accepted.
+        """
+        super().__init__()
+        self._reverse = reverse
+        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
+
+        self._simple_suffixes = set()
+        self._simple_prefixes = set()
+        self._domain_patterns = []
+        self._path_patterns = []
+
+        for pattern in patterns:
+            pattern_type = self._categorize_pattern(pattern)
+            self._add_pattern(pattern, pattern_type)
+
+    def _categorize_pattern(self, pattern: str) -> int:
+        """
+        Categorize pattern for specialized handling.
+
+        Returns one of the ``PATTERN_TYPES`` values. Non-string patterns
+        (already compiled regexes) are always PATH.
+        """
+        if not isinstance(pattern, str):
+            return self.PATTERN_TYPES["PATH"]
+
+        # Check if it's a regex pattern
+        if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern:
+            return self.PATTERN_TYPES["REGEX"]
+
+        if pattern.count("*") == 1:
+            if pattern.startswith("*."):
+                return self.PATTERN_TYPES["SUFFIX"]
+            if pattern.endswith("/*"):
+                return self.PATTERN_TYPES["PREFIX"]
+
+        # NOTE(review): a one-star pattern starting with "*." was already
+        # returned as SUFFIX above, so "*.example.com" never reaches this
+        # DOMAIN test. Classification is left unchanged; confirm the intent.
+        if "://" in pattern and pattern.startswith("*."):
+            return self.PATTERN_TYPES["DOMAIN"]
+
+        return self.PATTERN_TYPES["PATH"]
+
+    @staticmethod
+    def _expand_braces(pattern: str) -> List[str]:
+        """Expand ``{a,b}`` groups into the list of plain glob patterns."""
+        group = re.search(r"\{([^{}]+)\}", pattern)
+        if group is None:
+            return [pattern]
+        head, tail = pattern[: group.start()], pattern[group.end():]
+        expanded: List[str] = []
+        for option in group.group(1).split(","):
+            expanded.extend(URLPatternFilter._expand_braces(head + option + tail))
+        return expanded
+
+    def _add_pattern(self, pattern: str, pattern_type: int):
+        """Add pattern to the matcher selected by ``pattern_type``."""
+        if pattern_type == self.PATTERN_TYPES["REGEX"]:
+            # Regex patterns are compiled directly, without glob translation.
+            self._path_patterns.append(re.compile(pattern))
+        elif pattern_type == self.PATTERN_TYPES["SUFFIX"]:
+            self._simple_suffixes.add(pattern[2:])
+        elif pattern_type == self.PATTERN_TYPES["PREFIX"]:
+            self._simple_prefixes.add(pattern[:-2])
+        elif pattern_type == self.PATTERN_TYPES["DOMAIN"]:
+            self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\.")))
+        elif isinstance(pattern, str):
+            # Expand braces BEFORE fnmatch.translate(): translate() escapes
+            # regex metacharacters, so rewriting "{a,b}" to "(a|b)" (or "**"
+            # to ".*") first would only match those characters literally.
+            # "**" needs no special case because fnmatch's "*" already
+            # matches "/".
+            for glob in self._expand_braces(pattern):
+                self._path_patterns.append(re.compile(fnmatch.translate(glob)))
+        else:
+            self._path_patterns.append(
+                pattern if isinstance(pattern, Pattern) else re.compile(pattern)
+            )
+
+    @lru_cache(maxsize=10000)
+    def _matches(self, url: str) -> bool:
+        """
+        Return True when any stored pattern matches ``url`` (cached).
+
+        Only the pure match is cached; statistics are updated in ``apply``
+        so that cache hits are counted as well.
+        """
+        # URL without query string and fragment, for suffix/prefix checks
+        path = url.split("?", 1)[0].split("#", 1)[0]
+
+        # Quick suffix check (*.html): the last path segment must end with
+        # ".<suffix>", so an extension-less segment such as "/html" no
+        # longer counts as a match for "*.html".
+        if self._simple_suffixes:
+            filename = path.rsplit("/", 1)[-1]
+            if filename.endswith(tuple(f".{s}" for s in self._simple_suffixes)):
+                return True
+
+        # Domain check
+        if any(p.match(url) for p in self._domain_patterns):
+            return True
+
+        # Prefix check (/foo/*)
+        if any(path.startswith(p) for p in self._simple_prefixes):
+            return True
+
+        # Complex patterns
+        return any(p.search(url) for p in self._path_patterns)
+
+    def apply(self, url: str) -> bool:
+        """
+        Accept or reject ``url``.
+
+        Returns:
+            The match result, inverted when the filter was built with
+            ``reverse=True``. The statistics record this final decision.
+        """
+        matched = self._matches(url)
+        decision = (not matched) if self._reverse else matched
+        self._update_stats(decision)
+        return decision
+
+
+class ContentTypeFilter(URLFilter):
+    """
+    Filter URLs by the MIME type implied by their file extension.
+
+    The allowed MIME types are turned into a set of accepted extensions
+    once, at construction. URLs without an extension are always accepted.
+    """
+
+    __slots__ = ("allowed_types", "_ext_map", "_check_extension")
+
+    # Fast extension to mime type mapping
+    _MIME_MAP = {
+        # Text Formats
+        "txt": "text/plain",
+        "html": "text/html",
+        "htm": "text/html",
+        "xhtml": "application/xhtml+xml",
+        "css": "text/css",
+        "csv": "text/csv",
+        "ics": "text/calendar",
+        "js": "application/javascript",
+        # Images
+        "bmp": "image/bmp",
+        "gif": "image/gif",
+        "jpeg": "image/jpeg",
+        "jpg": "image/jpeg",
+        "png": "image/png",
+        "svg": "image/svg+xml",
+        "tiff": "image/tiff",
+        "ico": "image/x-icon",
+        "webp": "image/webp",
+        # Audio
+        "mp3": "audio/mpeg",
+        "wav": "audio/wav",
+        "ogg": "audio/ogg",
+        "m4a": "audio/mp4",
+        "aac": "audio/aac",
+        # Video
+        "mp4": "video/mp4",
+        "mpeg": "video/mpeg",
+        "webm": "video/webm",
+        "avi": "video/x-msvideo",
+        "mov": "video/quicktime",
+        "flv": "video/x-flv",
+        "wmv": "video/x-ms-wmv",
+        "mkv": "video/x-matroska",
+        # Applications
+        "json": "application/json",
+        "xml": "application/xml",
+        "pdf": "application/pdf",
+        "zip": "application/zip",
+        "gz": "application/gzip",
+        "tar": "application/x-tar",
+        "rar": "application/vnd.rar",
+        "7z": "application/x-7z-compressed",
+        "exe": "application/vnd.microsoft.portable-executable",
+        "msi": "application/x-msdownload",
+        # Fonts
+        "woff": "font/woff",
+        "woff2": "font/woff2",
+        "ttf": "font/ttf",
+        "otf": "font/otf",
+        # Microsoft Office
+        "doc": "application/msword",
+        "dot": "application/msword",
+        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "xls": "application/vnd.ms-excel",
+        "ppt": "application/vnd.ms-powerpoint",
+        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        # OpenDocument Formats
+        "odt": "application/vnd.oasis.opendocument.text",
+        "ods": "application/vnd.oasis.opendocument.spreadsheet",
+        "odp": "application/vnd.oasis.opendocument.presentation",
+        # Archives
+        "tar.gz": "application/gzip",
+        "tgz": "application/gzip",
+        "bz2": "application/x-bzip2",
+        # Others
+        "rtf": "application/rtf",
+        "apk": "application/vnd.android.package-archive",
+        "epub": "application/epub+zip",
+        "jar": "application/java-archive",
+        "swf": "application/x-shockwave-flash",
+        "midi": "audio/midi",
+        "mid": "audio/midi",
+        "ps": "application/postscript",
+        "ai": "application/postscript",
+        "eps": "application/postscript",
+        # Custom or less common
+        "bin": "application/octet-stream",
+        "dmg": "application/x-apple-diskimage",
+        "iso": "application/x-iso9660-image",
+        "deb": "application/x-debian-package",
+        "rpm": "application/x-rpm",
+        "sqlite": "application/vnd.sqlite3",
+        # Placeholder
+        "unknown": "application/octet-stream",  # Fallback for unknown file types
+    }
+
+    @staticmethod
+    @lru_cache(maxsize=1000)
+    def _extract_extension(url: str) -> str:
+        """
+        Extract the lower-cased file extension from a URL.
+
+        Returns an empty string when the URL has no path or when its last
+        path segment contains no dot.
+        """
+        # Drop fragment and query first: they are not part of the file name
+        # ("page.html?x=1" must yield "html") and may contain "/" or "://".
+        url = url.split("#", 1)[0].split("?", 1)[0]
+
+        # Remove scheme (http://, https://) if present
+        if "://" in url:
+            url = url.split("://", 1)[-1]  # Get everything after '://'
+
+        # Remove domain (everything up to the first '/')
+        path_start = url.find("/")
+        if path_start == -1:
+            return ""
+
+        # Extract last filename in path
+        filename = url[path_start:].rsplit("/", 1)[-1]
+        if "." not in filename:
+            return ""
+
+        return filename.rpartition(".")[-1].lower()
+
+    def __init__(
+        self,
+        allowed_types: Union[str, List[str]],
+        check_extension: bool = True,
+        ext_map: Dict[str, str] = _MIME_MAP,
+    ):
+        """
+        Args:
+            allowed_types: One MIME type (or fragment such as "image") or an
+                iterable of them; compared case-insensitively as substrings
+                of the mapped MIME types.
+            check_extension: When False every URL is accepted.
+            ext_map: Extension -> MIME type mapping used to derive the
+                accepted extensions; defaults to ``_MIME_MAP``.
+        """
+        super().__init__()
+        # A lone string is one type; any other iterable is taken as given.
+        type_list = [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        # Normalize and store as frozenset for fast lookup
+        self.allowed_types = frozenset(t.lower() for t in type_list)
+        self._check_extension = check_extension
+
+        # Honour the caller-supplied mapping (previously ignored); fall back
+        # to the built-in one if None is passed explicitly.
+        mime_by_ext = ext_map if ext_map is not None else self._MIME_MAP
+
+        # Pre-compute the set of extensions whose MIME type is allowed
+        self._ext_map = frozenset(
+            ext
+            for ext, mime in mime_by_ext.items()
+            if any(allowed in mime for allowed in self.allowed_types)
+        )
+
+    @lru_cache(maxsize=1000)
+    def _check_url_cached(self, url: str) -> bool:
+        """Return whether ``url`` is acceptable by extension (cached)."""
+        if not self._check_extension:
+            return True
+        ext = self._extract_extension(url)
+        if not ext:
+            # No extension to judge by: accept.
+            return True
+
+        return ext in self._ext_map
+
+    def apply(self, url: str) -> bool:
+        """Accept or reject ``url`` by extension and record the decision."""
+        result = self._check_url_cached(url)
+        self._update_stats(result)
+        return result
+
+
+class DomainFilter(URLFilter):
+    """
+    Accept or reject URLs by host name.
+
+    Blocked domains are checked first and always win. When an allow-list is
+    given, only hosts on it are accepted. Both lists also cover subdomains
+    of the listed domains.
+    """
+
+    __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
+
+    # Kept for backward compatibility; host extraction now uses urlparse.
+    _DOMAIN_REGEX = re.compile(r"://([^/]+)")
+
+    def __init__(
+        self,
+        allowed_domains: Union[str, List[str]] = None,
+        blocked_domains: Union[str, List[str]] = None,
+    ):
+        """
+        Args:
+            allowed_domains: Domain or list of domains to accept; when
+                omitted, every non-blocked domain is accepted.
+            blocked_domains: Domain or list of domains to reject.
+        """
+        super().__init__()
+
+        # Convert inputs to frozensets for immutable, fast lookups
+        self._allowed_domains = (
+            frozenset(self._normalize_domains(allowed_domains))
+            if allowed_domains
+            else None
+        )
+        self._blocked_domains = (
+            frozenset(self._normalize_domains(blocked_domains))
+            if blocked_domains
+            else frozenset()
+        )
+
+    @staticmethod
+    def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
+        """Return the given domain(s) as a set of lower-cased strings."""
+        if isinstance(domains, str):
+            return {domains.lower()}
+        return {d.lower() for d in domains}
+
+    @staticmethod
+    def _is_subdomain(domain: str, parent_domain: str) -> bool:
+        """Check if domain equals parent_domain or is a subdomain of it."""
+        return domain == parent_domain or domain.endswith(f".{parent_domain}")
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _extract_domain(url: str) -> str:
+        """
+        Return the lower-cased host name of ``url`` (cached).
+
+        Port, user info, query and fragment are not part of the result, so
+        "https://user@Example.com:8080?x=1" yields "example.com". Returns
+        an empty string when no host can be determined.
+        """
+        if "://" not in url:
+            return ""
+        try:
+            host = urlparse(url).hostname
+        except ValueError:
+            # e.g. a malformed IPv6 literal in the authority
+            return ""
+        return host or ""
+
+    def apply(self, url: str) -> bool:
+        """
+        Accept or reject ``url`` by its host name.
+
+        Returns:
+            False for blocked hosts; otherwise True when there is no
+            allow-list or the host is on it, False if not.
+        """
+        # Skip processing if no filters
+        if not self._blocked_domains and self._allowed_domains is None:
+            self._update_stats(True)
+            return True
+
+        domain = self._extract_domain(url)
+
+        # Check for blocked domains, including subdomains
+        for blocked in self._blocked_domains:
+            if self._is_subdomain(domain, blocked):
+                self._update_stats(False)
+                return False
+
+        # If no allowed domains specified, accept all non-blocked
+        if self._allowed_domains is None:
+            self._update_stats(True)
+            return True
+
+        # Check if domain matches any allowed domain (including subdomains)
+        for allowed in self._allowed_domains:
+            if self._is_subdomain(domain, allowed):
+                self._update_stats(True)
+                return True
+
+        # No matches found
+        self._update_stats(False)
+        return False
+
+
+class ContentRelevanceFilter(URLFilter):
+    """
+    BM25-based relevance filter using head section content.
+
+    The page's title and meta tags (fetched through ``HeadPeekr``) are
+    combined into a small weighted document, scored against the query with
+    a BM25-style formula, and the URL is accepted when the score reaches
+    ``threshold``.
+    """
+
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+
+    def __init__(
+        self,
+        query: str,
+        threshold: float,
+        k1: float = 1.2,
+        b: float = 0.75,
+        avgdl: int = 1000,
+    ):
+        """
+        Args:
+            query: Search query; split on whitespace, case-insensitive.
+            threshold: Minimum score for a URL to be accepted.
+            k1: TF saturation parameter.
+            b: Length normalization parameter.
+            avgdl: Average document length (empirical value).
+        """
+        super().__init__(name="BM25RelevanceFilter")
+        self.query_terms = self._tokenize(query)
+        self.threshold = threshold
+        self.k1 = k1  # TF saturation parameter
+        self.b = b  # Length normalization parameter
+        self.avgdl = avgdl  # Average document length (empirical value)
+
+    async def apply(self, url: str) -> bool:
+        """
+        Fetch the head of ``url`` and accept it when it is relevant enough.
+
+        Returns:
+            False when no head content could be fetched or the score is
+            below ``threshold``; True otherwise.
+        """
+        head_content = await HeadPeekr.peek_html(url)
+        if not head_content:
+            self._update_stats(False)
+            return False
+
+        # Field extraction with weighting
+        fields = {
+            "title": HeadPeekr.get_title(head_content) or "",
+            "meta": HeadPeekr.extract_meta_tags(head_content),
+        }
+        doc_text = self._build_document(fields)
+
+        score = self._bm25(doc_text)
+        decision = score >= self.threshold
+        self._update_stats(decision)
+        return decision
+
+    def _build_document(self, fields: Dict) -> str:
+        """
+        Build the weighted text that is scored.
+
+        The title is included three times and the description twice. The
+        copies are joined with spaces: plain string repetition ("ab" * 3)
+        would fuse the last word of one copy with the first word of the
+        next and distort the term counts.
+        """
+        title = fields["title"]
+        meta = fields["meta"]
+        parts = [title] * 3  # Title weight
+        parts += [meta.get("description", "")] * 2
+        parts.append(meta.get("keywords", ""))
+        parts.append(" ".join(meta.values()))
+        return " ".join(parts)
+
+    def _tokenize(self, text: str) -> List[str]:
+        """Lower-case ``text`` and split it on whitespace."""
+        return text.lower().split()
+
+    def _bm25(self, document: str) -> float:
+        """
+        Score ``document`` against the query terms.
+
+        Each distinct query term that occurs in the document contributes
+        ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avgdl))``
+        using the simplified IDF below; absent terms contribute nothing.
+        """
+        doc_terms = self._tokenize(document)
+        doc_len = len(doc_terms)
+        tf = defaultdict(int)
+
+        for term in doc_terms:
+            tf[term] += 1
+
+        score = 0.0
+        for term in set(self.query_terms):
+            term_freq = tf.get(term, 0)
+            if not term_freq:
+                # Contributes 0 anyway; skipping also avoids a 0/0 division
+                # when the length-normalization term is zero.
+                continue
+            idf = math.log((1 + 1) / (term_freq + 0.5) + 1)  # Simplified IDF
+            numerator = term_freq * (self.k1 + 1)
+            denominator = term_freq + self.k1 * (
+                1 - self.b + self.b * (doc_len / self.avgdl)
+            )
+            score += idf * (numerator / denominator)
+
+        return score
+
+
+class SEOFilter(URLFilter):
+    """Quantitative SEO quality assessment filter using head section analysis"""
+
+    __slots__ = ("threshold", "_weights", "_kw_patterns")
+
+    # Based on SEMrush/Google ranking factors research
+    DEFAULT_WEIGHTS = {
+        "title_length": 0.15,
+        "title_kw": 0.18,
+        "meta_description": 0.12,
+        "canonical": 0.10,
+        "robot_ok": 0.20,  # Most critical factor
+        "schema_org": 0.10,
+        "url_quality": 0.15,
+    }
+
+    def __init__(
+        self,
+        threshold: float = 0.65,
+        keywords: List[str] = None,
+        weights: Dict[str, float] = None,
+    ):
+        super().__init__(name="SEOFilter")
+        self.threshold = threshold
+        self._weights = weights or self.DEFAULT_WEIGHTS
+        self._kw_patterns = (
+            re.compile(
+                r"\b({})\b".format("|".join(map(re.escape, keywords or []))), re.I
+            )
+            if keywords
+            else None
+        )
+
+    async def apply(self, url: str) -> bool:
+        head_content = await HeadPeekr.peek_html(url)
+        if not head_content:
+            self._update_stats(False)
+            return False
+
+        meta = HeadPeekr.extract_meta_tags(head_content)
+        title = HeadPeekr.get_title(head_content) or ""
+        parsed_url = urlparse(url)
+
+        scores = {
+            "title_length": self._score_title_length(title),
+            "title_kw": self._score_keyword_presence(title),
+            "meta_description": self._score_meta_description(
+                meta.get("description", "")
+            ),
+            "canonical": self._score_canonical(meta.get("canonical"), url),
+            "robot_ok": 1.0 if "noindex" not in meta.get("robots", "") else 0.0,
+            "schema_org": self._score_schema_org(head_content),
+            "url_quality": self._score_url_quality(parsed_url),
+        }
+
+        total_score = sum(
+            weight * scores[factor] for factor, weight in self._weights.items()
+        )
+
+        decision = total_score >= self.threshold
+        self._update_stats(decision)
+        return decision
+
+    def _score_title_length(self, title: str) -> float:
+        length = len(title)
+        if 50 <= length <= 60:
+            return 1.0
+        if 40 <= length < 50 or 60 < length <= 70:
+            return 0.7
+        return 0.3  # Poor length
+
+    def _score_keyword_presence(self, text: str) -> float:
+        """Score ``text`` by the number of keyword matches it contains.
+
+        Each match of the compiled keyword pattern is worth 0.3 and the
+        result is capped at 1.0. Without a keyword pattern the score is 0.0.
+        """
+        pattern = self._kw_patterns
+        if not pattern:
+            return 0.0
+        hits = sum(1 for _ in pattern.finditer(text))
+        return min(hits * 0.3, 1.0)
+
+    def _score_meta_description(self, desc: str) -> float:
+        """Rate a meta description by its character count.
+
+        Returns 1.0 for 140-160 characters, 0.5 for any other length within
+        120-200 characters, and 0.2 otherwise.
+        """
+        size = len(desc)
+        if 140 <= size <= 160:
+            return 1.0
+        if 120 <= size <= 200:
+            return 0.5
+        return 0.2
+
+    def _score_canonical(self, canonical: str, original: str) -> float:
+        """Compare a page's canonical URL with the URL being evaluated.
+
+        Returns 0.5 when no canonical value is given, 1.0 when it is exactly
+        equal to ``original``, and 0.2 when it differs.
+        """
+        if not canonical:
+            return 0.5
+        if canonical == original:
+            return 1.0
+        return 0.2
+
+    def _score_schema_org(self, html: str) -> float:
+        """Return 1.0 when ``html`` has a JSON-LD ``<script>`` tag, else 0.0.
+
+        The check is a regex search for a ``<script`` tag whose ``type``
+        attribute value starts with ``application/ld+json``.
+        """
+        found = re.search(r'<script[^>]+type=["\']application/ld\+json', html)
+        return 0.0 if found is None else 1.0
+
+    def _score_url_quality(self, parsed_url) -> float:
+        """Score a parsed URL, starting at 1.0 and multiplying in penalties.
+
+        Penalties, applied in this order when their condition holds:
+        path longer than 80 characters (x0.7), four consecutive digits in
+        the path (x0.8), a non-empty query string (x0.6), and an underscore
+        in the path (x0.9).
+
+        Args:
+            parsed_url: Object exposing ``path`` and ``query`` attributes.
+        """
+        path = parsed_url.path.lower()
+        penalties = (
+            (len(path) > 80, 0.7),
+            (re.search(r"\d{4}", path) is not None, 0.8),
+            (bool(parsed_url.query), 0.6),
+            ("_" in path, 0.9),
+        )
+
+        score = 1.0
+        for applies, factor in penalties:
+            if applies:
+                score *= factor
+        return score
+
+```
+
+
+## File: crawl4ai/deep_crawling/scorers.py
+
+```py
+from abc import ABC, abstractmethod
+from typing import List, Dict, Optional
+from dataclasses import dataclass
+from urllib.parse import urlparse, unquote
+import re
+import logging
+from functools import lru_cache
+from array import array
+import ctypes
+import platform
+# Operating-system name as reported by platform.system()
+PLATFORM = platform.system()
+
+# Pre-computed 1 / (1 + distance) scores for path-depth distances 0 to 3
+_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25]
+
+# Pre-computed freshness scores indexed by age in years (0 = current year)
+_FRESHNESS_SCORES = [
+   1.0,    # Current year
+   0.9,    # Last year
+   0.8,    # 2 years ago
+   0.7,    # 3 years ago
+   0.6,    # 4 years ago
+   0.5,    # 5 years ago
+]
+
+class ScoringStats:
+    """Running count, sum, minimum and maximum of the scores given to update()."""
+
+    __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')
+
+    def __init__(self):
+        self._urls_scored = 0
+        self._total_score = 0.0
+        # None means "no score recorded yet"
+        self._min_score = None
+        self._max_score = None
+
+    def update(self, score: float) -> None:
+        """Record one score.
+
+        Min and max are tracked on every call; deferring them until a getter
+        had been called would lose every score seen before that call.
+        """
+        self._urls_scored += 1
+        self._total_score += score
+
+        if self._min_score is None or score < self._min_score:
+            self._min_score = score
+        if self._max_score is None or score > self._max_score:
+            self._max_score = score
+
+    def get_average(self) -> float:
+        """Return the mean of all recorded scores, or 0.0 when there are none."""
+        return self._total_score / self._urls_scored if self._urls_scored else 0.0
+
+    def get_min(self) -> float:
+        """Return the smallest recorded score, or 0.0 when there are none."""
+        return 0.0 if self._min_score is None else self._min_score
+
+    def get_max(self) -> float:
+        """Return the largest recorded score, or 0.0 when there are none."""
+        return 0.0 if self._max_score is None else self._max_score
+
+
+class URLScorer(ABC):
+    """Base class for URL scorers: a raw per-URL score scaled by a fixed weight.
+
+    Subclasses implement ``_calculate_score``. ``score`` multiplies that
+    result by the weight and records the product in ``stats``.
+    """
+
+    __slots__ = ('_weight', '_stats')
+
+    def __init__(self, weight: float = 1.0):
+        """Store the weight multiplier and create an empty stats tracker.
+
+        Args:
+            weight: Factor applied to every raw score.
+        """
+        # A Python float is always a double, so a float32 round-trip through
+        # ctypes cannot save memory; it only perturbs the value
+        # (0.1 would become 0.10000000149011612). Keep the weight exact.
+        self._weight = float(weight)
+        self._stats = ScoringStats()
+
+    @abstractmethod
+    def _calculate_score(self, url: str) -> float:
+        """Calculate raw (unweighted) score for URL."""
+        pass
+
+    def score(self, url: str) -> float:
+        """Return the raw score times the weight and record it in ``stats``."""
+        score = self._calculate_score(url) * self._weight
+        self._stats.update(score)
+        return score
+
+    @property
+    def stats(self):
+        """Access to scoring statistics."""
+        return self._stats
+
+    @property
+    def weight(self):
+        """The weight multiplier passed at construction."""
+        return self._weight
+
+class CompositeScorer(URLScorer):
+    """Combine several scorers by summing, or averaging, their weighted scores."""
+
+    __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')
+
+    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
+        """Initialize composite scorer combining multiple scoring strategies.
+
+        Args:
+            scorers: List of scoring strategies to combine
+            normalize: Whether to divide the summed score by the scorer count
+        """
+        super().__init__(weight=1.0)
+        self._scorers = scorers
+        self._normalize = normalize
+
+        # Single-precision snapshots of the child weights and of the most
+        # recent child scores. The combined score is not computed from them.
+        self._weights_array = array('f', [s.weight for s in scorers])
+        self._score_array = array('f', [0.0] * len(scorers))
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate combined score from all scoring strategies.
+
+        This method is deliberately not memoized: a cached result would skip
+        the child ``score()`` calls, so the children's statistics would miss
+        every repeated URL, and a class-level cache keyed on ``self`` would
+        keep scorer instances alive.
+
+        Args:
+            url: URL to score
+
+        Returns:
+            Sum of the children's weighted scores, divided by the number of
+            children when normalization is enabled and there is at least one.
+        """
+        total_score = 0.0
+        scores = self._score_array
+
+        for i, scorer in enumerate(self._scorers):
+            # Public score() applies the child's weight and updates its stats.
+            value = scorer.score(url)
+            scores[i] = value  # stored as float32; informational only
+            # Accumulate the full-precision value, not the float32 copy.
+            total_score += value
+
+        if self._normalize and self._scorers:
+            return total_score / len(self._scorers)
+
+        return total_score
+
+    def score(self, url: str) -> float:
+        """Public scoring interface with stats tracking.
+
+        Args:
+            url: URL to score
+
+        Returns:
+            Final combined score
+        """
+        score = self._calculate_score(url)
+        self.stats.update(score)
+        return score
+
+class KeywordRelevanceScorer(URLScorer):
+    """Score a URL by the fraction of the configured keywords it contains."""
+    __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive')
+
+    def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
+        """Store the keywords, lower-cased unless matching is case sensitive.
+
+        Args:
+            keywords: Substrings to look for in each URL.
+            weight: Multiplier applied to the raw score.
+            case_sensitive: When False, keywords and URLs are lower-cased
+                before comparison.
+        """
+        super().__init__(weight=weight)
+        self._case_sensitive = case_sensitive
+        if case_sensitive:
+            self._keywords = list(keywords)
+        else:
+            self._keywords = [word.lower() for word in keywords]
+
+    @lru_cache(maxsize=10000)
+    def _url_bytes(self, url: str) -> bytes:
+        """Return the URL encoded as UTF-8, lower-cased first unless case sensitive."""
+        text = url if self._case_sensitive else url.lower()
+        return text.encode('utf-8')
+
+
+    def _calculate_score(self, url: str) -> float:
+        """Return the share of keywords that occur as substrings of ``url``.
+
+        The result is 0.0 when nothing matches (including when there are no
+        keywords) and 1.0 when every keyword matches.
+        """
+        haystack = url if self._case_sensitive else url.lower()
+        wanted = len(self._keywords)
+
+        hits = 0
+        for word in self._keywords:
+            if word in haystack:
+                hits += 1
+
+        if hits == 0:
+            return 0.0
+        if hits == wanted:
+            return 1.0
+        return hits / wanted
+
+class PathDepthScorer(URLScorer):
+    """Score a URL by how close its path depth is to an optimal depth."""
+
+    __slots__ = ('_weight', '_stats', '_optimal_depth')
+
+    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
+        """Store the target depth.
+
+        Args:
+            optimal_depth: Number of path segments that earns the top score.
+            weight: Multiplier applied to the raw score.
+        """
+        super().__init__(weight=weight)
+        self._optimal_depth = optimal_depth
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _quick_depth(path: str) -> int:
+        """Count the non-empty segments of a path.
+
+        Examples:
+            - "" -> 0
+            - "/" -> 0
+            - "/a" -> 1
+            - "/a/b/" -> 2
+            - "/a//b" -> 2  # repeated slashes add no depth
+
+        A string containing no "/" at all yields 0.
+        """
+        if '/' not in path:
+            return 0
+        return sum(1 for segment in path.split('/') if segment)
+
+    @lru_cache(maxsize=10000)  # Cache the whole calculation
+    def _calculate_score(self, url: str) -> float:
+        """Return 1 / (1 + |depth - optimal_depth|) for the URL's path.
+
+        The query string and fragment are cut off before counting, so
+        slashes inside them do not inflate the depth.
+        """
+        # The path ends at the first '?' or '#', whichever comes first.
+        end = len(url)
+        for sep in '?#':
+            idx = url.find(sep)
+            if idx != -1 and idx < end:
+                end = idx
+
+        # Look for the scheme separator only before the query, so a URL
+        # embedded in a query value is not mistaken for the scheme. When
+        # there is no scheme, find() gives -1 and the search starts at
+        # index 2, which also steps over the leading "//" of a
+        # protocol-relative URL.
+        pos = url.find('/', url.find('://', 0, end) + 3, end)
+        depth = 0 if pos == -1 else self._quick_depth(url[pos:end])
+
+        distance = abs(depth - self._optimal_depth)
+
+        # Use lookup table for common distances
+        if distance < 4:
+            return _SCORE_LOOKUP[distance]
+
+        return 1.0 / (1.0 + distance)
+
+class ContentTypeScorer(URLScorer):
+    """Score a URL from its file extension or from regex patterns."""
+
+    __slots__ = ('_weight', '_exact_types', '_regex_types')
+
+    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
+        """Initialize scorer with type weights map.
+
+        Args:
+            type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0})
+            weight: Overall weight multiplier for this scorer
+        """
+        super().__init__(weight=weight)
+        self._exact_types = {}  # Fast lookup for simple extensions
+        self._regex_types = []  # Fallback for complex patterns
+
+        # Split into exact vs regex matchers for performance
+        for pattern, score in type_weights.items():
+            ext = pattern[1:-1]
+            # Only a plain ".ext$" is an exact extension. Something like
+            # ".(jpg|png)$" also starts with "." and ends with "$" but must
+            # be compiled, otherwise it could never match.
+            if pattern.startswith('.') and pattern.endswith('$') and ext.isalnum():
+                # Lower-cased because _quick_extension returns lower case.
+                self._exact_types[ext.lower()] = score
+            else:
+                self._regex_types.append((re.compile(pattern), score))
+
+        # Highest score first, so the first regex hit is the best one
+        self._regex_types.sort(key=lambda x: -x[1])
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _quick_extension(url: str) -> str:
+        """Extract the file extension of the last path segment.
+
+        Handles:
+        - Basic extensions: "example.html" -> "html"
+        - Query strings: "page.php?id=1.5" -> "php"
+        - Fragments: "doc.pdf#page=1" -> "pdf"
+        - Path params: "file.jpg;width=100" -> "jpg"
+        - No file name: "https://example.com" -> ""
+
+        Args:
+            url: URL to extract extension from
+
+        Returns:
+            Lower-cased extension without dot, or empty string if none found
+        """
+        # Ignore the query string and fragment: dots there are not extensions.
+        end = len(url)
+        for sep in '?#':
+            idx = url.find(sep)
+            if idx != -1 and idx < end:
+                end = idx
+
+        # With a scheme present, a URL with no path has no file name; the
+        # dots in the host must not be read as an extension.
+        scheme = url.find('://', 0, end)
+        if scheme != -1 and url.find('/', scheme + 3, end) == -1:
+            return ''
+
+        # Restrict the search to the last path segment, minus path params.
+        segment_start = url.rfind('/', 0, end) + 1
+        params = url.find(';', segment_start, end)
+        if params != -1:
+            end = params
+
+        dot = url.rfind('.', segment_start, end)
+        if dot == -1:
+            return ''
+
+        # The extension is the run of alphanumerics right after the dot.
+        stop = dot + 1
+        while stop < end and url[stop].isalnum():
+            stop += 1
+
+        return url[dot + 1:stop].lower()
+
+    @lru_cache(maxsize=10000)
+    def _calculate_score(self, url: str) -> float:
+        """Calculate content type score for URL.
+
+        Uses staged approach:
+        1. Try exact extension match (fast path)
+        2. Fall back to regex patterns if needed
+
+        Args:
+            url: URL to score
+
+        Returns:
+            The configured score of the first match, or 0.0 when nothing
+            matches
+        """
+        # Fast path: direct extension lookup
+        ext = self._quick_extension(url)
+        if ext:
+            score = self._exact_types.get(ext)
+            if score is not None:
+                return score
+
+        # Slow path: regex patterns
+        for pattern, score in self._regex_types:
+            if pattern.search(url):
+                return score
+
+        return 0.0
+
+class FreshnessScorer(URLScorer):
+    """Score a URL by the age of the most recent year found in it."""
+    __slots__ = ('_weight', '_date_pattern', '_current_year')
+
+    def __init__(self, weight: float = 1.0, current_year: int = 2024):
+        """Compile the date pattern and remember the reference year.
+
+        Years are recognised when preceded by "/", "-" or "_", optionally
+        followed by a two-digit month and a two-digit day using the same
+        separators (YYYY, YYYY/MM, YYYY-MM-DD, YYYY_MM_DD, ...).
+
+        Args:
+            weight: Score multiplier
+            current_year: Year that ages are measured against. NOTE: the
+                default is the fixed value 2024, not the year at run time,
+                and years later than this value are ignored.
+        """
+        super().__init__(weight=weight)
+        self._current_year = current_year
+
+        # Only the year is captured; month and day are matched but discarded.
+        self._date_pattern = re.compile(
+            r'(?:/'  # leading "/"
+            r'|[-_])'  # or leading "-" / "_"
+            r'((?:19|20)\d{2})'  # captured year, 1900-2099
+            r'(?:'  # begin optional month part
+            r'(?:/|[-_])'  # separator
+            r'(?:\d{2})'  # two-digit month
+            r'(?:'  # begin optional day part
+            r'(?:/|[-_])'  # separator
+            r'(?:\d{2})'  # two-digit day
+            r')?'  # end optional day part
+            r')?'  # end optional month part
+        )
+
+    @lru_cache(maxsize=10000)
+    def _extract_year(self, url: str) -> Optional[int]:
+        """Return the latest year in ``url`` that is not after the reference year.
+
+        Args:
+            url: URL to extract year from
+
+        Returns:
+            Year as int, or None when no such year is present
+        """
+        found = [int(hit.group(1)) for hit in self._date_pattern.finditer(url)]
+        # Years beyond the reference year are discarded.
+        usable = [year for year in found if year <= self._current_year]
+        return max(usable) if usable else None
+
+    @lru_cache(maxsize=10000)
+    def _calculate_score(self, url: str) -> float:
+        """Turn the age of the URL's year into a freshness score.
+
+        Args:
+            url: URL to score
+
+        Returns:
+            0.5 when no year is found; otherwise the table value for ages
+            covered by ``_FRESHNESS_SCORES``, and ``1.0 - age * 0.1`` floored
+            at 0.1 for older years
+        """
+        year = self._extract_year(url)
+        if year is None:
+            return 0.5
+
+        age = self._current_year - year
+        if age >= len(_FRESHNESS_SCORES):
+            # Older than the table covers: linear decay with a floor.
+            return max(0.1, 1.0 - age * 0.1)
+        return _FRESHNESS_SCORES[age]
+
+class DomainAuthorityScorer(URLScorer):
+    """Score a URL by looking its host up in a table of per-domain scores."""
+
+    __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains')
+
+    def __init__(
+        self,
+        domain_weights: Dict[str, float],
+        default_weight: float = 0.5,
+        weight: float = 1.0,
+    ):
+        """Initialize domain authority scorer.
+
+        Args:
+            domain_weights: Dict mapping domains to authority scores
+            default_weight: Score for unknown domains
+            weight: Overall scorer weight multiplier
+
+        Example:
+            {
+                'python.org': 1.0,
+                'github.com': 0.9,
+                'medium.com': 0.7
+            }
+        """
+        super().__init__(weight=weight)
+
+        # Lower-case the keys once; _extract_domain returns lower case.
+        self._domain_weights = {
+            domain.lower(): score
+            for domain, score in domain_weights.items()
+        }
+        self._default_weight = default_weight
+
+        # The five highest-scoring domains, taken from the lower-cased table
+        # so that these keys can actually match an extracted host.
+        self._top_domains = dict(
+            sorted(self._domain_weights.items(), key=lambda x: -x[1])[:5]
+        )
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _extract_domain(url: str) -> str:
+        """Extract the host from a URL without a full parse.
+
+        Handles:
+        - Basic domains: "example.com"
+        - Subdomains: "sub.example.com"
+        - Ports: "example.com:8080"
+        - IPv4: "192.168.1.1"
+        - Userinfo: "user:pw@example.com"
+        - Bracketed IPv6: "[::1]:8080" -> "[::1]"
+        - No path: "example.com?next=/home" -> "example.com"
+
+        Args:
+            url: Full URL to extract domain from
+
+        Returns:
+            Lowercase domain without port
+        """
+        # Find domain start
+        start = url.find('://')
+        start = 0 if start == -1 else start + 3
+
+        # The host ends at whichever of "/", "?" or "#" comes first. Taking
+        # the first "/" unconditionally would swallow a query string that
+        # itself contains a slash.
+        end = len(url)
+        for sep in '/?#':
+            idx = url.find(sep, start)
+            if idx != -1 and idx < end:
+                end = idx
+
+        domain = url[start:end]
+
+        # Drop "user:password@" if present
+        at = domain.rfind('@')
+        if at != -1:
+            domain = domain[at + 1:]
+
+        # Strip the port, but never cut at a colon inside an IPv6 literal
+        port_idx = domain.rfind(':')
+        if port_idx > domain.rfind(']'):
+            domain = domain[:port_idx]
+
+        return domain.lower()
+
+    @lru_cache(maxsize=10000)
+    def _calculate_score(self, url: str) -> float:
+        """Calculate domain authority score.
+
+        Uses staged approach:
+        1. Check top domains
+        2. Check full domain weights
+        3. Return default weight
+
+        Only an exact host match counts.
+
+        Args:
+            url: URL to score
+
+        Returns:
+            The configured score for the host, or the default weight
+        """
+        domain = self._extract_domain(url)
+
+        # Fast path: check top domains first
+        score = self._top_domains.get(domain)
+        if score is not None:
+            return score
+
+        # Regular path: check all domains
+        return self._domain_weights.get(domain, self._default_weight)
+```
+
+
+## File: docs/examples/deepcrawl_example.py
+
+```py
+import asyncio
+import time
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import (
+    FilterChain,
+    URLPatternFilter,
+    DomainFilter,
+    ContentTypeFilter,
+    ContentRelevanceFilter,
+    SEOFilter,
+)
+from crawl4ai.deep_crawling.scorers import (
+    KeywordRelevanceScorer,
+)
+
+
+# 1️⃣ Basic Deep Crawl Setup
+async def basic_deep_crawl():
+    """
+    PART 1: Basic deep crawl - a breadth-first crawl limited to depth 2.
+
+    Runs BFSDeepCrawlStrategy against the docs site without following
+    external links, groups the crawled URLs by the "depth" value in each
+    result's metadata, and prints up to three URLs per depth plus timing.
+    """
+    print("\n===== BASIC DEEP CRAWL SETUP =====")
+
+    # Breadth-first, two levels below the start page, same-domain links only
+    strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=strategy,
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        verbose=True,
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        start_time = time.perf_counter()
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        # Bucket URLs by crawl depth; results without a depth count as 0
+        pages_by_depth = {}
+        for page in results:
+            level = page.metadata.get("depth", 0)
+            pages_by_depth.setdefault(level, []).append(page.url)
+
+        print(f"✅ Crawled {len(results)} pages total")
+
+        for depth in sorted(pages_by_depth):
+            urls = pages_by_depth[depth]
+            print(f"\nDepth {depth}: {len(urls)} pages")
+            # A sample of at most three URLs per level
+            for url in urls[:3]:
+                print(f"  → {url}")
+            if len(urls) > 3:
+                print(f"  ... and {len(urls) - 3} more")
+
+        print(
+            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
+        )
+
+# 2️⃣ Stream vs. Non-Stream Execution
+async def stream_vs_nonstream():
+    """
+    PART 2: Demonstrates the difference between stream and non-stream execution.
+
+    Non-stream: Waits for all results before processing
+    Stream: Processes results as they become available
+
+    The same depth-1 breadth-first crawl is run twice, once per mode, and
+    the timings of each run are printed.
+    """
+    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")
+
+    # Common configuration for both examples
+    base_config = CrawlerRunConfig(
+        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        verbose=False,
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # NON-STREAMING MODE
+        print("\n📊 NON-STREAMING MODE:")
+        print("  In this mode, all results are collected before being returned.")
+
+        non_stream_config = base_config.clone()
+        non_stream_config.stream = False
+
+        start_time = time.perf_counter()
+        results = await crawler.arun(
+            url="https://docs.crawl4ai.com", config=non_stream_config
+        )
+
+        print(f"  ✅ Received all {len(results)} results at once")
+        print(f"  ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")
+
+        # STREAMING MODE
+        print("\n📊 STREAMING MODE:")
+        print("  In this mode, results are processed as they become available.")
+
+        stream_config = base_config.clone()
+        stream_config.stream = True
+
+        start_time = time.perf_counter()
+        result_count = 0
+        first_result_time = None
+
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=stream_config
+        ):
+            result_count += 1
+            if result_count == 1:
+                first_result_time = time.perf_counter() - start_time
+                print(
+                    f"  ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
+                )
+            elif result_count % 5 == 0:  # Show every 5th result for brevity
+                print(f"  → Result #{result_count}: {result.url}")
+
+        print(f"  ✅ Total: {result_count} results")
+        # first_result_time stays None when the stream yields nothing, and
+        # formatting None with ":.2f" raises TypeError, so skip the line then.
+        if first_result_time is not None:
+            print(f"  ✅ First result: {first_result_time:.2f} seconds")
+        print(f"  ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
+        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
+
+# 3️⃣ Introduce Filters & Scorers
+async def filters_and_scorers():
+    """
+    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
+
+    This function progressively adds:
+    1. A single URL pattern filter
+    2. Multiple filters in a chain
+    3. Scorers for prioritizing pages
+    """
+    print("\n===== FILTERS AND SCORERS =====")
+
+    async with AsyncWebCrawler() as crawler:
+        # SINGLE FILTER EXAMPLE
+        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
+        print("  Only crawl pages containing 'core' in the URL")
+
+        # Create a filter that only allows URLs with 'core' in them
+        url_filter = URLPatternFilter(patterns=["*core*"])
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=1,
+                include_external=False,
+                filter_chain=FilterChain([url_filter]),  # Single filter
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            cache_mode=CacheMode.BYPASS,
+            verbose=True,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        print(f"  ✅ Crawled {len(results)} pages matching '*core*'")
+        for result in results[:3]:  # Show first 3 results
+            print(f"  → {result.url}")
+        if len(results) > 3:
+            print(f"  ... and {len(results) - 3} more")
+
+        # MULTIPLE FILTERS EXAMPLE
+        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
+        print("  Only crawl pages that:")
+        print("  1. Contain '2024' in the URL")
+        print("  2. Are from 'techcrunch.com'")
+        print("  3. Are of text/html or application/javascript content type")
+
+        # Create a chain of filters
+        filter_chain = FilterChain(
+            [
+                URLPatternFilter(patterns=["*2024*"]),
+                DomainFilter(
+                    allowed_domains=["techcrunch.com"],
+                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
+                ),
+                ContentTypeFilter(
+                    allowed_types=["text/html", "application/javascript"]
+                ),
+            ]
+        )
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=1, include_external=False, filter_chain=filter_chain
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+        )
+
+        results = await crawler.arun(url="https://techcrunch.com", config=config)
+
+        print(f"  ✅ Crawled {len(results)} pages after applying all filters")
+        for result in results[:3]:
+            print(f"  → {result.url}")
+        if len(results) > 3:
+            print(f"  ... and {len(results) - 3} more")
+
+        # SCORERS EXAMPLE
+        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
+        print(
+            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
+        )
+
+        # Create a keyword relevance scorer
+        keyword_scorer = KeywordRelevanceScorer(
+            keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
+        )
+
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=BestFirstCrawlingStrategy(
+                max_depth=1, include_external=False, url_scorer=keyword_scorer
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            cache_mode=CacheMode.BYPASS,
+            verbose=True,
+            stream=True,
+        )
+
+        results = []
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            # A missing or None score would make the ":.2f" format below
+            # raise TypeError, so fall back to 0.
+            score = result.metadata.get("score") or 0
+            print(f"  → Score: {score:.2f} | {result.url}")
+
+        print(f"  ✅ Crawler prioritized {len(results)} pages by relevance score")
+        print("  🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
+
+# 4️⃣ Advanced Filters
+async def advanced_filters():
+    """
+    PART 4: Advanced filters - SEOFilter and ContentRelevanceFilter.
+
+    Runs two depth-1 breadth-first crawls of the docs site, each with a
+    single-filter chain, and prints the URLs that came back (with the
+    "relevance_score" metadata value for the second run).
+    """
+    print("\n===== ADVANCED FILTERS =====")
+
+    async with AsyncWebCrawler() as crawler:
+        # --- Run 1: SEO filter -------------------------------------------
+        print("\n📊 EXAMPLE 1: SEO FILTERS")
+        print(
+            "Quantitative SEO quality assessment filter based searching keywords in the head section"
+        )
+
+        seo = SEOFilter(
+            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
+        )
+        seo_strategy = BFSDeepCrawlStrategy(
+            max_depth=1, filter_chain=FilterChain([seo])
+        )
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=seo_strategy,
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        print(f"  ✅ Found {len(results)} pages with relevant keywords")
+        for result in results:
+            print(f"  → {result.url}")
+
+        # --- Run 2: content relevance filter -----------------------------
+        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
+
+        relevance = ContentRelevanceFilter(
+            query="Interact with the web using your authentic digital identity",
+            threshold=0.7,
+        )
+        relevance_strategy = BFSDeepCrawlStrategy(
+            max_depth=1, filter_chain=FilterChain([relevance])
+        )
+        config = CrawlerRunConfig(
+            deep_crawl_strategy=relevance_strategy,
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+        print(f"  ✅ Found {len(results)} pages")
+        for result in results:
+            # Results lacking the metadata key are shown with a score of 0
+            relevance_score = result.metadata.get("relevance_score", 0)
+            print(f"  → Score: {relevance_score:.2f} | {result.url}")
+
+# 5️⃣ Max Pages and Score Thresholds
+async def max_pages_and_thresholds():
+    """
+    PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
+    
+    This function shows:
+    - How to limit the number of pages crawled
+    - How to set score thresholds for more targeted crawling
+    - Comparing BFS, DFS, and Best-First strategies with these parameters
+    """
+    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
+    
+    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+    
+    async with AsyncWebCrawler() as crawler:
+        # Define a common keyword scorer for all examples
+        keyword_scorer = KeywordRelevanceScorer(
+            keywords=["browser", "crawler", "web", "automation"], 
+            weight=1.0
+        )
+        
+        # EXAMPLE 1: BFS WITH MAX PAGES
+        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
+        print("  Limit the crawler to a maximum of 5 pages")
+        
+        bfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=BFSDeepCrawlStrategy(
+                max_depth=2, 
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=5  # Only crawl 5 pages
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+        
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
+        
+        print(f"  ✅ Crawled exactly {len(results)} pages as specified by max_pages")
+        for result in results:
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | {result.url}")
+            
+        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
+        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
+        print("  Only crawl pages with a relevance score above 0.5")
+        
+        dfs_config = CrawlerRunConfig(
+            deep_crawl_strategy=DFSDeepCrawlStrategy(
+                max_depth=2,
+                include_external=False, 
+                url_scorer=keyword_scorer,
+                score_threshold=0.7,  # Only process URLs with scores above 0.5
+                max_pages=10
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+        )
+        
+        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
+        
+        print(f"  ✅ Crawled {len(results)} pages with scores above threshold")
+        for result in results:
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")
+            
+        # EXAMPLE 3: BEST-FIRST WITH BOTH CONSTRAINTS
+        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS")
+        print("  Limit to 7 pages with scores above 0.3, prioritizing highest scores")
+        
+        bf_config = CrawlerRunConfig(
+            deep_crawl_strategy=BestFirstCrawlingStrategy(
+                max_depth=2,
+                include_external=False,
+                url_scorer=keyword_scorer,
+                max_pages=7,          # Limit to 7 pages total
+            ),
+            scraping_strategy=LXMLWebScrapingStrategy(),
+            verbose=True,
+            cache_mode=CacheMode.BYPASS,
+            stream=True,
+        )
+        
+        results = []
+        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"  → Depth: {depth} | Score: {score:.2f} | {result.url}")
+            
+        print(f"  ✅ Crawled {len(results)} high-value pages with scores above 0.3")
+        if results:
+            avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
+            print(f"  ✅ Average score: {avg_score:.2f}")
+            print("  🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
+
+# 6️⃣ Wrap-Up and Key Takeaways
+async def wrap_up():
+    """
+    PART 6: Wrap-Up and Key Takeaways
+
+    Summarize the key concepts learned in this tutorial.
+    """
+    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
+    print("Combining filters, scorers, and streaming for an optimized crawl")
+
+    # Create a sophisticated filter chain
+    filter_chain = FilterChain(
+        [
+            DomainFilter(
+                allowed_domains=["docs.crawl4ai.com"],
+                blocked_domains=["old.docs.crawl4ai.com"],
+            ),
+            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
+            ContentTypeFilter(allowed_types=["text/html"]),
+        ]
+    )
+
+    # Create a composite scorer that combines multiple scoring strategies
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["crawl", "example", "async", "configuration"], weight=0.7
+    )
+    # Set up the configuration
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=1,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=keyword_scorer,
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        stream=True,
+        verbose=True,
+    )
+
+    # Execute the crawl
+    results = []
+    start_time = time.perf_counter()
+
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun(
+            url="https://docs.crawl4ai.com", config=config
+        ):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+    duration = time.perf_counter() - start_time
+
+    # Summarize the results
+    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+    print(
+        f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
+    )
+
+    # Group by depth
+    depth_counts = {}
+    for result in results:
+        depth = result.metadata.get("depth", 0)
+        depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+    print("\n📊 Pages crawled by depth:")
+    for depth, count in sorted(depth_counts.items()):
+        print(f"  Depth {depth}: {count} pages")
+
+async def run_tutorial():
+    """
+    Executes all tutorial sections in sequence.
+    """
+    print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
+    print("======================================")
+    print("This tutorial will walk you through deep crawling techniques,")
+    print("from basic to advanced, using the Crawl4AI library.")
+
+    # Define sections - uncomment to run specific parts during development
+    tutorial_sections = [
+        basic_deep_crawl,
+        stream_vs_nonstream,
+        filters_and_scorers,
+        max_pages_and_thresholds, 
+        advanced_filters,
+        wrap_up,
+    ]
+
+    for section in tutorial_sections:
+        await section()
+
+    print("\n🎉 TUTORIAL COMPLETE! 🎉")
+    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
+    print("For more information, check out https://docs.crawl4ai.com")
+
+# Script entry point: run the whole tutorial via asyncio.run() when executed directly
+if __name__ == "__main__":
+    asyncio.run(run_tutorial())
+```
diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md
new file mode 100644
index 00000000..1642f85e
--- /dev/null
+++ b/deploy/docker/c4ai-doc-context.md
@@ -0,0 +1,8899 @@
+# Crawl4AI Doc Context
+
+Generated on 2025-04-21
+
+## File: docs/md_v2/core/ask-ai.md
+
+```md
+<div class="ask-ai-container">
+<iframe id="ask-ai-frame" src="../../ask_ai/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI Assistant"></iframe>
+</div>
+
+<script>
+// Iframe height adjustment
+function resizeAskAiIframe() {
+  const iframe = document.getElementById('ask-ai-frame');
+  if (iframe) {
+    const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
+    // Footer is removed by JS below, so calculate height based on header + small buffer
+    const topOffset = headerHeight + 20; // Header + buffer/margin
+
+    const availableHeight = window.innerHeight - topOffset;
+    iframe.style.height = Math.max(600, availableHeight) + 'px'; // Min height 600px
+  }
+}
+
+// Run immediately and on resize/load
+resizeAskAiIframe(); // Initial call
+let resizeTimer;
+window.addEventListener('load', resizeAskAiIframe);
+window.addEventListener('resize', () => {
+    clearTimeout(resizeTimer);
+    resizeTimer = setTimeout(resizeAskAiIframe, 150);
+});
+
+// Remove Footer & HR from parent page (DOM Ready might be safer)
+document.addEventListener('DOMContentLoaded', () => {
+    setTimeout(() => { // Add slight delay just in case elements render slowly
+        const footer = window.parent.document.querySelector('footer'); // Target parent document
+        if (footer) {
+            const hrBeforeFooter = footer.previousElementSibling;
+            if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
+                hrBeforeFooter.remove();
+            }
+            footer.remove();
+            // Trigger resize again after removing footer
+            resizeAskAiIframe();
+        } else {
+             console.warn("Ask AI Page: Could not find footer in parent document to remove.");
+        }
+    }, 100); // Shorter delay
+});
+</script>
+
+<style>
+#terminal-mkdocs-main-content {
+    padding: 0 !important;
+    margin: 0;
+    width: 100%;
+    height: 100%;
+    overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
+}
+
+/* Ensure iframe container takes full space */
+#terminal-mkdocs-main-content .ask-ai-container {
+    /* Remove negative margins if footer removal handles space */
+     margin: 0;
+    padding: 0;
+    max-width: none;
+    /* Let the JS set the height */
+    /* height: 600px; Initial fallback height */
+    overflow: hidden; /* Hide potential overflow before JS resize */
+}
+
+/* Hide title/paragraph if they were part of the markdown */
+/* Alternatively, just remove them from the .md file directly */
+/* #terminal-mkdocs-main-content > h1,
+#terminal-mkdocs-main-content > p:first-of-type {
+    display: none;
+} */
+
+</style>
+
+```
+
+
+## File: docs/md_v2/core/browser-crawler-config.md
+
+```md
+# Browser, Crawler & LLM Configuration (Quick Overview)
+
+Crawl4AI’s flexibility stems from three key classes:
+
+1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).  
+2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).  
+3. **`LLMConfig`** – Dictates **how** LLM providers are configured (model, API token, base URL, temperature, etc.).
+
+In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
+
+---
+
+## 1. BrowserConfig Essentials
+
+```python
+class BrowserConfig:
+    def __init__(
+        browser_type="chromium",
+        headless=True,
+        proxy_config=None,
+        viewport_width=1080,
+        viewport_height=600,
+        verbose=True,
+        use_persistent_context=False,
+        user_data_dir=None,
+        cookies=None,
+        headers=None,
+        user_agent=None,
+        text_mode=False,
+        light_mode=False,
+        extra_args=None,
+        # ... other advanced parameters omitted here
+    ):
+        ...
+```
+
+### Key Fields to Note
+
+
+
+1. **`browser_type`**  
+   - Options: `"chromium"`, `"firefox"`, or `"webkit"`.  
+   - Defaults to `"chromium"`.  
+   - If you need a different engine, specify it here.
+
+2. **`headless`**  
+   - `True`: Runs the browser in headless mode (invisible browser).  
+   - `False`: Runs the browser in visible mode, which helps with debugging.
+
+3. **`proxy_config`**  
+   - A dictionary with fields like:  
+```json
+{
+    "server": "http://proxy.example.com:8080", 
+    "username": "...", 
+    "password": "..."
+}
+```
+   - Leave as `None` if a proxy is not required.
+
+4. **`viewport_width` & `viewport_height`**:  
+   - The initial window size.  
+   - Some sites behave differently with smaller or bigger viewports.
+
+5. **`verbose`**:  
+   - If `True`, prints extra logs.  
+   - Handy for debugging.
+
+6. **`use_persistent_context`**:  
+   - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.  
+   - Typically also set `user_data_dir` to point to a folder.
+
+7. **`cookies`** & **`headers`**:  
+   - If you want to start with specific cookies or add universal HTTP headers, set them here.  
+   - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
+
+8. **`user_agent`**:  
+   - Custom User-Agent string. If `None`, a default is used.  
+   - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
+
+9. **`text_mode`** & **`light_mode`**:  
+   - `text_mode=True` disables images, possibly speeding up text-only crawls.  
+   - `light_mode=True` turns off certain background features for performance.  
+
+10. **`extra_args`**:  
+    - Additional flags for the underlying browser.  
+    - E.g. `["--disable-extensions"]`.
+
+### Helper Methods
+
+Both configuration classes provide a `clone()` method to create modified copies:
+
+```python
+# Create a base browser config
+base_browser = BrowserConfig(
+    browser_type="chromium",
+    headless=True,
+    text_mode=True
+)
+
+# Create a visible browser config for debugging
+debug_browser = base_browser.clone(
+    headless=False,
+    verbose=True
+)
+```
+
+**Minimal Example**:
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+browser_conf = BrowserConfig(
+    browser_type="firefox",
+    headless=False,
+    text_mode=True
+)
+
+async with AsyncWebCrawler(config=browser_conf) as crawler:
+    result = await crawler.arun("https://example.com")
+    print(result.markdown[:300])
+```
+
+---
+
+## 2. CrawlerRunConfig Essentials
+
+```python
+class CrawlerRunConfig:
+    def __init__(
+        word_count_threshold=200,
+        extraction_strategy=None,
+        markdown_generator=None,
+        cache_mode=None,
+        js_code=None,
+        wait_for=None,
+        screenshot=False,
+        pdf=False,
+        capture_mhtml=False,
+        enable_rate_limiting=False,
+        rate_limit_config=None,
+        memory_threshold_percent=70.0,
+        check_interval=1.0,
+        max_session_permit=20,
+        display_mode=None,
+        verbose=True,
+        stream=False,  # Enable streaming for arun_many()
+        # ... other advanced parameters omitted
+    ):
+        ...
+```
+
+### Key Fields to Note
+
+1. **`word_count_threshold`**:  
+   - The minimum word count before a block is considered.  
+   - If your site has lots of short paragraphs or items, you can lower it.
+
+2. **`extraction_strategy`**:  
+   - Where you plug in JSON-based extraction (CSS, LLM, etc.).  
+   - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
+
+3. **`markdown_generator`**:  
+   - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.  
+   - If `None`, a default approach is used.
+
+4. **`cache_mode`**:  
+   - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).  
+   - If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`.
+
+5. **`js_code`**:  
+   - A string or list of JS strings to execute.  
+   - Great for “Load More” buttons or user interactions.  
+
+6. **`wait_for`**:  
+   - A CSS or JS expression to wait for before extracting content.  
+   - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
+
+7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:  
+   - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.  
+   - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
+8. **`verbose`**:  
+   - Logs additional runtime details.  
+   - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
+
+9. **`enable_rate_limiting`**:  
+   - If `True`, enables rate limiting for batch processing.  
+   - Requires `rate_limit_config` to be set.
+
+10. **`memory_threshold_percent`**:  
+    - The memory threshold (as a percentage) to monitor.  
+    - If exceeded, the crawler will pause or slow down.
+
+11. **`check_interval`**:  
+    - The interval (in seconds) to check system resources.  
+    - Affects how often memory and CPU usage are monitored.
+
+12. **`max_session_permit`**:  
+    - The maximum number of concurrent crawl sessions.  
+    - Helps prevent overwhelming the system.
+
+13. **`display_mode`**:  
+    - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).  
+    - Affects how much information is printed during the crawl.
+
+### Helper Methods
+
+The `clone()` method is particularly useful for creating variations of your crawler configuration:
+
+```python
+# Create a base configuration
+base_config = CrawlerRunConfig(
+    cache_mode=CacheMode.ENABLED,
+    word_count_threshold=200,
+    wait_until="networkidle"
+)
+
+# Create variations for different use cases
+stream_config = base_config.clone(
+    stream=True,  # Enable streaming mode
+    cache_mode=CacheMode.BYPASS
+)
+
+debug_config = base_config.clone(
+    page_timeout=120000,  # Longer timeout for debugging
+    verbose=True
+)
+```
+
+The `clone()` method:
+- Creates a new instance with all the same settings
+- Updates only the specified parameters
+- Leaves the original configuration unchanged
+- Perfect for creating variations without repeating all parameters
+
+---
+
+
+
+
+
+## 3. LLMConfig Essentials
+
+### Key Fields to Note
+
+1. **`provider`**:  
+- Which LLM provider to use.
+- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
+
+2. **`api_token`**:  
+    - Optional. When not provided explicitly, `api_token` is read from an environment variable chosen by the provider. For example, if a Gemini model is passed as the provider, `"GEMINI_API_KEY"` is read from the environment.  
+    - API token of the LLM provider <br/> e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
+    - Environment variable – use the `env:` prefix (no space after the colon) <br/> e.g. `api_token = "env:GROQ_API_KEY"`
+
+3. **`base_url`**:  
+   - Set this if your provider is served from a custom endpoint.
+
+```python
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+## 4. Putting It All Together
+
+In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    # 1) Browser config: headless, bigger viewport, no proxy
+    browser_conf = BrowserConfig(
+        headless=True,
+        viewport_width=1280,
+        viewport_height=720
+    )
+
+    # 2) Example extraction strategy
+    schema = {
+        "name": "Articles",
+        "baseSelector": "div.article",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+    extraction = JsonCssExtractionStrategy(schema)
+
+    # 3) Example LLM content filtering
+
+    gemini_config = LLMConfig(
+        provider="gemini/gemini-1.5-pro" 
+        api_token = "env:GEMINI_API_TOKEN"
+    )
+
+    # Initialize LLM filter with specific instruction
+    filter = LLMContentFilter(
+        llm_config=gemini_config,  # or your preferred provider
+        instruction="""
+        Focus on extracting the core educational content.
+        Include:
+        - Key concepts and explanations
+        - Important code examples
+        - Essential technical details
+        Exclude:
+        - Navigation elements
+        - Sidebars
+        - Footer content
+        Format the output as clean markdown with proper code blocks and headers.
+        """,
+        chunk_token_threshold=500,  # Adjust based on your needs
+        verbose=True
+    )
+
+    md_generator = DefaultMarkdownGenerator(
+    content_filter=filter,
+    options={"ignore_links": True}
+
+    # 4) Crawler run config: skip cache, use extraction
+    run_conf = CrawlerRunConfig(
+        markdown_generator=md_generator,
+        extraction_strategy=extraction,
+        cache_mode=CacheMode.BYPASS,
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        # 4) Execute the crawl
+        result = await crawler.arun(url="https://example.com/news", config=run_conf)
+
+        if result.success:
+            print("Extracted content:", result.extracted_content)
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 5. Next Steps
+
+For a **detailed list** of available parameters (including advanced ones), see:
+
+- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)  
+
+You can explore topics like:
+
+- **Custom Hooks & Auth** (Inject JavaScript or handle login forms).  
+- **Session Management** (Re-use pages, preserve state across multiple calls).  
+- **Magic Mode** or **Identity-based Crawling** (Fight bot detection by simulating user behavior).  
+- **Advanced Caching** (Fine-tune read/write cache modes).  
+
+---
+
+## 6. Conclusion
+
+**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
+
+- **Which** browser to launch, how it should run, and any proxy or user agent needs.  
+- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
+- **Which** LLM provider to use, api token, temperature and base url for custom endpoints
+
+Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling!
+```
+
+
+## File: docs/md_v2/core/cache-modes.md
+
+```md
+# Crawl4AI Cache System and Migration Guide
+
+## Overview
+Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
+
+## Old vs New Approach
+
+### Old Way (Deprecated)
+The old system used multiple boolean flags:
+- `bypass_cache`: Skip cache entirely
+- `disable_cache`: Disable all caching
+- `no_cache_read`: Don't read from cache
+- `no_cache_write`: Don't write to cache
+
+### New Way (Recommended)
+The new system uses a single `CacheMode` enum:
+- `CacheMode.ENABLED`: Normal caching (read/write)
+- `CacheMode.DISABLED`: No caching at all
+- `CacheMode.READ_ONLY`: Only read from cache
+- `CacheMode.WRITE_ONLY`: Only write to cache
+- `CacheMode.BYPASS`: Skip cache for this operation
+
+## Migration Example
+
+### Old Code (Deprecated)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            bypass_cache=True  # Old way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### New Code (Recommended)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def use_proxy():
+    # Use CacheMode in CrawlerRunConfig
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)  
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=config  # Pass the configuration object
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Common Migration Patterns
+
+| Old Flag              | New Mode                       |
+|-----------------------|---------------------------------|
+| `bypass_cache=True`   | `cache_mode=CacheMode.BYPASS`  |
+| `disable_cache=True`  | `cache_mode=CacheMode.DISABLED`|
+| `no_cache_read=True`  | `cache_mode=CacheMode.WRITE_ONLY` |
+| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
+```
+
+
+## File: docs/md_v2/core/cli.md
+
+```md
+# Crawl4AI CLI Guide
+
+## Table of Contents
+- [Installation](#installation)
+- [Basic Usage](#basic-usage)
+- [Configuration](#configuration)
+  - [Browser Configuration](#browser-configuration)
+  - [Crawler Configuration](#crawler-configuration)
+  - [Extraction Configuration](#extraction-configuration)
+  - [Content Filtering](#content-filtering)
+- [Advanced Features](#advanced-features)
+  - [LLM Q&A](#llm-qa)
+  - [Structured Data Extraction](#structured-data-extraction)
+  - [Content Filtering](#content-filtering-1)
+- [Output Formats](#output-formats)
+- [Complete Examples](#complete-examples)
+- [Configuration Reference](#configuration-reference)
+- [Best Practices & Tips](#best-practices--tips)
+
+## Basic Usage
+
+The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
+
+```bash
+# Basic crawling
+crwl https://example.com
+
+# Get markdown output
+crwl https://example.com -o markdown
+
+# Verbose JSON output with cache bypass
+crwl https://example.com -o json -v --bypass-cache
+
+# See usage examples
+crwl --example
+```
+
+## Quick Example of Advanced Usage
+
+If you clone the repository and run the following command, you will receive the content of the page in JSON format according to a JSON-CSS schema:
+
+```bash
+crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json;
+```
+
+## Configuration
+
+### Browser Configuration
+
+Browser settings can be configured via YAML file or command line parameters:
+
+```yaml
+# browser.yml
+headless: true
+viewport_width: 1280
+user_agent_mode: "random"
+verbose: true
+ignore_https_errors: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -B browser.yml
+
+# Using direct parameters
+crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+```
+
+### Crawler Configuration
+
+Control crawling behavior:
+
+```yaml
+# crawler.yml
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -C crawler.yml
+
+# Using direct parameters
+crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+```
+
+### Extraction Configuration
+
+Two types of extraction are supported:
+
+1. CSS/XPath-based extraction:
+```yaml
+# extract_css.yml
+type: "json-css"
+params:
+  verbose: true
+```
+
+```json
+// css_schema.json
+{
+  "name": "ArticleExtractor",
+  "baseSelector": ".article",
+  "fields": [
+    {
+      "name": "title",
+      "selector": "h1.title",
+      "type": "text"
+    },
+    {
+      "name": "link",
+      "selector": "a.read-more",
+      "type": "attribute",
+      "attribute": "href"
+    }
+  ]
+}
+```
+
+2. LLM-based extraction:
+```yaml
+# extract_llm.yml
+type: "llm"
+provider: "openai/gpt-4"
+instruction: "Extract all articles with their titles and links"
+api_token: "your-token"
+params:
+  temperature: 0.3
+  max_tokens: 1000
+```
+
+```json
+// llm_schema.json
+{
+  "title": "Article",
+  "type": "object",
+  "properties": {
+    "title": {
+      "type": "string",
+      "description": "The title of the article"
+    },
+    "link": {
+      "type": "string",
+      "description": "URL to the full article"
+    }
+  }
+}
+```
+
+## Advanced Features
+
+### LLM Q&A
+
+Ask questions about crawled content:
+
+```bash
+# Simple question
+crwl https://example.com -q "What is the main topic discussed?"
+
+# View content then ask questions
+crwl https://example.com -o markdown  # See content first
+crwl https://example.com -q "Summarize the key points"
+crwl https://example.com -q "What are the conclusions?"
+
+# Combined with advanced crawling
+crwl https://example.com \
+    -B browser.yml \
+    -c "css_selector=article,scan_full_page=true" \
+    -q "What are the pros and cons mentioned?"
+```
+
+First-time setup:
+- Prompts for LLM provider and API token
+- Saves configuration in `~/.crawl4ai/global.yml`
+- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
+- When using `ollama`, you do not need to provide an API token.
+- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for full list
+
+### Structured Data Extraction
+
+Extract structured data using CSS selectors:
+
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json
+```
+
+Or using LLM-based extraction:
+
+```bash
+crwl https://example.com \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -o json
+```
+
+### Content Filtering
+
+Filter content for relevance:
+
+```yaml
+# filter_bm25.yml
+type: "bm25"
+query: "target content"
+threshold: 1.0
+
+# filter_pruning.yml
+type: "pruning"
+query: "focus topic"
+threshold: 0.48
+```
+
+```bash
+crwl https://example.com -f filter_bm25.yml -o markdown-fit
+```
+
+## Output Formats
+
+- `all` - Full crawl result including metadata
+- `json` - Extracted structured data (when using extraction)
+- `markdown` / `md` - Raw markdown output
+- `markdown-fit` / `md-fit` - Filtered markdown for better readability
+
+## Complete Examples
+
+1. Basic Extraction:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -C crawler.yml \
+    -o json
+```
+
+2. Structured Data Extraction:
+```bash
+crwl https://example.com \
+    -e extract_css.yml \
+    -s css_schema.json \
+    -o json \
+    -v
+```
+
+3. LLM Extraction with Filtering:
+```bash
+crwl https://example.com \
+    -B browser.yml \
+    -e extract_llm.yml \
+    -s llm_schema.json \
+    -f filter_bm25.yml \
+    -o json
+```
+
+4. Interactive Q&A:
+```bash
+# First crawl and view
+crwl https://example.com -o markdown
+
+# Then ask questions
+crwl https://example.com -q "What are the main points?"
+crwl https://example.com -q "Summarize the conclusions"
+```
+
+## Best Practices & Tips
+
+1. **Configuration Management**:
+   - Keep common configurations in YAML files
+   - Use CLI parameters for quick overrides
+   - Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
+
+2. **Performance Optimization**:
+   - Use `--bypass-cache` for fresh content
+   - Enable `scan_full_page` for infinite scroll pages
+   - Adjust `delay_before_return_html` for dynamic content
+
+3. **Content Extraction**:
+   - Use CSS extraction for structured content
+   - Use LLM extraction for unstructured content
+   - Combine with filters for focused results
+
+4. **Q&A Workflow**:
+   - View content first with `-o markdown`
+   - Ask specific questions
+   - Use broader context with appropriate selectors
+
+## Recap
+
+The Crawl4AI CLI provides:
+- Flexible configuration via files and parameters
+- Multiple extraction strategies (CSS, XPath, LLM)
+- Content filtering and optimization
+- Interactive Q&A capabilities
+- Various output formats
+
+
+```
+
+
+## File: docs/md_v2/core/content-selection.md
+
+```md
+# Content Selection
+
+Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters.
+
+Below, we show how to configure these parameters and combine them for precise control.
+
+---
+
+## 1. CSS-Based Selection
+
+There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
+
+### 1.1 Using `css_selector`
+
+A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        # e.g., first 30 items from Hacker News
+        css_selector=".athing:nth-child(-n+30)"  
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com/newest", 
+            config=config
+        )
+        print("Partial HTML length:", len(result.cleaned_html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Result**: Only elements matching that selector remain in `result.cleaned_html`.
+
+### 1.2 Using `target_elements`
+
+The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        # Target article body and sidebar, but not other content
+        target_elements=["article.main-content", "aside.sidebar"]
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com/blog-post", 
+            config=config
+        )
+        print("Markdown focused on target elements")
+        print("Links from entire page still available:", len(result.links.get("internal", [])))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
+
+---
+
+## 2. Content Filtering & Exclusions
+
+### 2.1 Basic Overview
+
+```python
+config = CrawlerRunConfig(
+    # Content thresholds
+    word_count_threshold=10,        # Minimum words per block
+
+    # Tag exclusions
+    excluded_tags=['form', 'header', 'footer', 'nav'],
+
+    # Link filtering
+    exclude_external_links=True,    
+    exclude_social_media_links=True,
+    # Block entire domains
+    exclude_domains=["adtrackers.com", "spammynews.org"],    
+    exclude_social_media_domains=["facebook.com", "twitter.com"],
+
+    # Media filtering
+    exclude_external_images=True
+)
+```
+
+**Explanation**:
+
+- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers.  
+- **`excluded_tags`**: Removes entire tags (`<form>`, `<header>`, `<footer>`, etc.).  
+- **Link Filtering**:  
+  - `exclude_external_links`: Strips out external links and may remove them from `result.links`.  
+  - `exclude_social_media_links`: Removes links pointing to known social media domains.  
+  - `exclude_domains`: A custom list of domains to block if discovered in links.  
+  - `exclude_social_media_domains`: A curated list (override or add to it) for social media sites.  
+- **Media Filtering**:  
+  - `exclude_external_images`: Discards images not hosted on the same domain as the main page (or its subdomains).
+
+By default, when you set `exclude_social_media_links=True`, links to the following social media domains are excluded:
+```python
+[
+    'facebook.com',
+    'twitter.com',
+    'x.com',
+    'linkedin.com',
+    'instagram.com',
+    'pinterest.com',
+    'tiktok.com',
+    'snapchat.com',
+    'reddit.com',
+]
+```
+
+
+### 2.2 Example Usage
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    config = CrawlerRunConfig(
+        css_selector="main.content", 
+        word_count_threshold=10,
+        excluded_tags=["nav", "footer"],
+        exclude_external_links=True,
+        exclude_social_media_links=True,
+        exclude_domains=["ads.com", "spammytrackers.net"],
+        exclude_external_images=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://news.ycombinator.com", config=config)
+        print("Cleaned HTML length:", len(result.cleaned_html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Note**: If these parameters remove too much, reduce or disable them accordingly.
+
+---
+
+## 3. Handling Iframes
+
+Some sites embed content in `<iframe>` tags. If you want that inline:
+```python
+config = CrawlerRunConfig(
+    # Merge iframe content into the final output
+    process_iframes=True,    
+    remove_overlay_elements=True
+)
+```
+
+**Usage**:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        process_iframes=True,
+        remove_overlay_elements=True
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.org/iframe-demo", 
+            config=config
+        )
+        print("Iframe-merged length:", len(result.cleaned_html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 4. Structured Extraction Examples
+
+You can combine content selection with a more advanced extraction strategy. For instance, a **CSS-based** or **LLM-based** extraction strategy can run on the filtered HTML.
+
+### 4.1 Pattern-Based with `JsonCssExtractionStrategy`
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    # Minimal schema for repeated items
+    schema = {
+        "name": "News Items",
+        "baseSelector": "tr.athing",
+        "fields": [
+            {"name": "title", "selector": "span.titleline a", "type": "text"},
+            {
+                "name": "link", 
+                "selector": "span.titleline a", 
+                "type": "attribute", 
+                "attribute": "href"
+            }
+        ]
+    }
+
+    config = CrawlerRunConfig(
+        # Content filtering
+        excluded_tags=["form", "header"],
+        exclude_domains=["adsite.com"],
+        
+        # CSS selection or entire page
+        css_selector="table.itemlist",
+
+        # No caching for demonstration
+        cache_mode=CacheMode.BYPASS,
+
+        # Extraction strategy
+        extraction_strategy=JsonCssExtractionStrategy(schema)
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com/newest", 
+            config=config
+        )
+        data = json.loads(result.extracted_content)
+        print("Sample extracted item:", data[:1])  # Show first item
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### 4.2 LLM-Based Extraction
+
+```python
+import asyncio
+import json
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class ArticleData(BaseModel):
+    headline: str
+    summary: str
+
+async def main():
+    llm_strategy = LLMExtractionStrategy(
+        llm_config=LLMConfig(provider="openai/gpt-4", api_token="sk-YOUR_API_KEY"),
+        schema=ArticleData.schema(),
+        extraction_type="schema",
+        instruction="Extract 'headline' and a short 'summary' from the content."
+    )
+
+    config = CrawlerRunConfig(
+        exclude_external_links=True,
+        word_count_threshold=20,
+        extraction_strategy=llm_strategy
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://news.ycombinator.com", config=config)
+        article = json.loads(result.extracted_content)
+        print(article)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+Here, the crawler:
+
+- Filters out external links (`exclude_external_links=True`).  
+- Ignores very short text blocks (`word_count_threshold=20`).  
+- Passes the final HTML to your LLM strategy for an AI-driven parse.
+
+---
+
+## 5. Comprehensive Example
+
+Below is a short function that unifies **CSS selection**, **exclusion** logic, and a pattern-based extraction, demonstrating how you can fine-tune your final data:
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def extract_main_articles(url: str):
+    schema = {
+        "name": "ArticleBlock",
+        "baseSelector": "div.article-block",
+        "fields": [
+            {"name": "headline", "selector": "h2", "type": "text"},
+            {"name": "summary", "selector": ".summary", "type": "text"},
+            {
+                "name": "metadata",
+                "type": "nested",
+                "fields": [
+                    {"name": "author", "selector": ".author", "type": "text"},
+                    {"name": "date", "selector": ".date", "type": "text"}
+                ]
+            }
+        ]
+    }
+
+    config = CrawlerRunConfig(
+        # Keep only #main-content
+        css_selector="#main-content",
+        
+        # Filtering
+        word_count_threshold=10,
+        excluded_tags=["nav", "footer"],  
+        exclude_external_links=True,
+        exclude_domains=["somebadsite.com"],
+        exclude_external_images=True,
+
+        # Extraction
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=url, config=config)
+        if not result.success:
+            print(f"Error: {result.error_message}")
+            return None
+        return json.loads(result.extracted_content)
+
+async def main():
+    articles = await extract_main_articles("https://news.ycombinator.com/newest")
+    if articles:
+        print("Extracted Articles:", articles[:2])  # Show first 2
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Why This Works**:
+- **CSS** scoping with `#main-content`.  
+- Multiple **exclude_** parameters to remove domains, external images, etc.  
+- A **JsonCssExtractionStrategy** to parse repeated article blocks.
+
+---
+
+## 6. Scraping Modes
+
+Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
+
+async def main():
+    config = CrawlerRunConfig(
+        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com", 
+            config=config
+        )
+```
+
+You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
+
+```python
+from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
+
+class CustomScrapingStrategy(ContentScrapingStrategy):
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # Implement your custom scraping logic here
+        return ScrapingResult(
+            cleaned_html="<html>...</html>",  # Cleaned HTML content
+            success=True,                     # Whether scraping was successful
+            media=Media(
+                images=[                      # List of images found
+                    MediaItem(
+                        src="https://example.com/image.jpg",
+                        alt="Image description",
+                        desc="Surrounding text",
+                        score=1,
+                        type="image",
+                        group_id=1,
+                        format="jpg",
+                        width=800
+                    )
+                ],
+                videos=[],                    # List of videos (same structure as images)
+                audios=[]                     # List of audio files (same structure as images)
+            ),
+            links=Links(
+                internal=[                    # List of internal links
+                    Link(
+                        href="https://example.com/page",
+                        text="Link text",
+                        title="Link title",
+                        base_domain="example.com"
+                    )
+                ],
+                external=[]                   # List of external links (same structure)
+            ),
+            metadata={                        # Additional metadata
+                "title": "Page Title",
+                "description": "Page description"
+            }
+        )
+
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # For simple cases, you can use the sync version
+        return await asyncio.to_thread(self.scrap, url, html, **kwargs)
+```
+
+### Performance Considerations
+
+The LXML strategy can be up to 10-20x faster than the BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
+
+1. LXML strategy is currently experimental
+2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
+3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/unclecode/crawl4ai/issues) with a reproducible example
+
+Choose LXML strategy when:
+- Processing large HTML documents (recommended for >100KB)
+- Performance is critical
+- Working with well-formed HTML
+
+Stick to BeautifulSoup strategy (default) when:
+- Maximum compatibility is needed
+- Working with malformed HTML
+- Exact parsing behavior is critical
+
+---
+
+## 7. Combining CSS Selection Methods
+
+You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    # Target specific content but preserve page context
+    config = CrawlerRunConfig(
+        # Focus markdown on main content and sidebar
+        target_elements=["#main-content", ".sidebar"],
+        
+        # Global filters applied to entire page
+        excluded_tags=["nav", "footer", "header"],
+        exclude_external_links=True,
+        
+        # Use basic content thresholds
+        word_count_threshold=15,
+        
+        cache_mode=CacheMode.BYPASS
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com/article",
+            config=config
+        )
+        
+        print(f"Content focuses on specific elements, but all links still analyzed")
+        print(f"Internal links: {len(result.links.get('internal', []))}")
+        print(f"External links: {len(result.links.get('external', []))}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+This approach gives you the best of both worlds:
+- Markdown generation and content extraction focus on the elements you care about
+- Links, images and other page data still give you the full context of the page
+- Content filtering still applies globally
+
+## 8. Conclusion
+
+By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
+
+1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media.
+2. **`css_selector`** – Basic scoping to an element or region for all extraction processes.  
+3. **`word_count_threshold`** – Skip short blocks.  
+4. **`excluded_tags`** – Remove entire HTML tags.  
+5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.  
+6. **`exclude_external_images`** – Remove images from external sources.  
+7. **`process_iframes`** – Merge iframe content if needed.  
+
+Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!
+```
+
+
+## File: docs/md_v2/core/crawler-result.md
+
+```md
+# Crawl Result and Output
+
+When you call `arun()` on a page, Crawl4AI returns a **`CrawlResult`** object containing everything you might need—raw HTML, a cleaned version, optional screenshots or PDFs, structured extraction results, and more. This document explains those fields and how they map to different output types.  
+
+---
+
+## 1. The `CrawlResult` Model
+
+Below is the core schema. Each field captures a different aspect of the crawl’s result:
+
+```python
+class MarkdownGenerationResult(BaseModel):
+    raw_markdown: str
+    markdown_with_citations: str
+    references_markdown: str
+    fit_markdown: Optional[str] = None
+    fit_html: Optional[str] = None
+
+class CrawlResult(BaseModel):
+    url: str
+    html: str
+    success: bool
+    cleaned_html: Optional[str] = None
+    media: Dict[str, List[Dict]] = {}
+    links: Dict[str, List[Dict]] = {}
+    downloaded_files: Optional[List[str]] = None
+    screenshot: Optional[str] = None
+    pdf : Optional[bytes] = None
+    mhtml: Optional[str] = None
+    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
+    extracted_content: Optional[str] = None
+    metadata: Optional[dict] = None
+    error_message: Optional[str] = None
+    session_id: Optional[str] = None
+    response_headers: Optional[dict] = None
+    status_code: Optional[int] = None
+    ssl_certificate: Optional[SSLCertificate] = None
+    class Config:
+        arbitrary_types_allowed = True
+```
+
+### Table: Key Fields in `CrawlResult`
+
+| Field (Name & Type)                       | Description                                                                                         |
+|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
+| **url (`str`)**                           | The final or actual URL crawled (in case of redirects).                                             |
+| **html (`str`)**                          | Original, unmodified page HTML. Good for debugging or custom processing.                            |
+| **success (`bool`)**                      | `True` if the crawl completed without major errors, else `False`.                                   |
+| **cleaned_html (`Optional[str]`)**        | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
+| **media (`Dict[str, List[Dict]]`)**       | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc.   |
+| **links (`Dict[str, List[Dict]]`)**       | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
+| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads.         |
+| **screenshot (`Optional[str]`)**          | Screenshot of the page (base64-encoded) if `screenshot=True`.                                       |
+| **pdf (`Optional[bytes]`)**               | PDF of the page if `pdf=True`.                                                                      |
+| **mhtml (`Optional[str]`)**               | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources.      |
+| **markdown (`Optional[str or MarkdownGenerationResult]`)** | Generated markdown, normally a `MarkdownGenerationResult`; this field replaces the deprecated `markdown_v2`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
+| **extracted_content (`Optional[str]`)**   | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text.          |
+| **metadata (`Optional[dict]`)**           | Additional info about the crawl or extracted data.                                                  |
+| **error_message (`Optional[str]`)**       | If `success=False`, contains a short description of what went wrong.                                |
+| **session_id (`Optional[str]`)**          | The ID of the session used for multi-page or persistent crawling.                                   |
+| **response_headers (`Optional[dict]`)**   | HTTP response headers, if captured.                                                                 |
+| **status_code (`Optional[int]`)**         | HTTP status code (e.g., 200 for OK).                                                                |
+| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`.                                               |
+
+---
+
+## 2. HTML Variants
+
+### `html`: Raw HTML
+
+Crawl4AI preserves the exact HTML as `result.html`. Useful for:
+
+- Debugging page issues or checking the original content.
+- Performing your own specialized parse if needed.
+
+### `cleaned_html`: Sanitized
+
+If you specify any cleanup or exclusion parameters in `CrawlerRunConfig` (like `excluded_tags`, `remove_forms`, etc.), you’ll see the result here:
+
+```python
+config = CrawlerRunConfig(
+    excluded_tags=["form", "header", "footer"],
+    keep_data_attributes=False
+)
+result = await crawler.arun("https://example.com", config=config)
+print(result.cleaned_html)  # Freed of forms, header, footer, data-* attributes
+```
+
+---
+
+## 3. Markdown Generation
+
+### 3.1 `markdown`
+
+- **`markdown`**: The current location for detailed markdown output, returning a **`MarkdownGenerationResult`** object.  
+- **`markdown_v2`**: Deprecated since v0.5.
+
+**`MarkdownGenerationResult`** Fields:
+
+| Field                   | Description                                                                    |
+|-------------------------|--------------------------------------------------------------------------------|
+| **raw_markdown**        | The basic HTML→Markdown conversion.                                            |
+| **markdown_with_citations** | Markdown including inline citations that reference links at the end.        |
+| **references_markdown** | The references/citations themselves (if `citations=True`).                      |
+| **fit_markdown**        | The filtered/“fit” markdown if a content filter was used.                       |
+| **fit_html**            | The filtered HTML that generated `fit_markdown`.                                |
+
+### 3.2 Basic Example with a Markdown Generator
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+config = CrawlerRunConfig(
+    markdown_generator=DefaultMarkdownGenerator(
+        options={"citations": True, "body_width": 80}  # e.g. pass html2text style options
+    )
+)
+result = await crawler.arun(url="https://example.com", config=config)
+
+md_res = result.markdown  # holds the MarkdownGenerationResult
+print(md_res.raw_markdown[:500])
+print(md_res.markdown_with_citations)
+print(md_res.references_markdown)
+```
+
+**Note**: If you use a filter like `PruningContentFilter`, you’ll get `fit_markdown` and `fit_html` as well.
+
+---
+
+## 4. Structured Extraction: `extracted_content`
+
+If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structured data is **not** stored in `markdown`—it’s placed in **`result.extracted_content`** as a JSON string (or sometimes plain text).
+
+### Example: CSS Extraction with `raw://` HTML
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    schema = {
+        "name": "Example Items",
+        "baseSelector": "div.item",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+    raw_html = "<div class='item'><h2>Item 1</h2><a href='https://example.com/item1'>Link 1</a></div>"
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="raw://" + raw_html,
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                extraction_strategy=JsonCssExtractionStrategy(schema)
+            )
+        )
+        data = json.loads(result.extracted_content)
+        print(data)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+Here:
+- `url="raw://..."` passes the HTML content directly, no network requests.  
+- The **CSS** extraction strategy populates `result.extracted_content` with the JSON array `[{"title": "...", "link": "..."}]`.
+
+---
+
+## 5. More Fields: Links, Media, and More
+
+### 5.1 `links`
+
+A dictionary, typically with `"internal"` and `"external"` lists. Each entry might have `href`, `text`, `title`, etc. This is automatically captured if you haven’t disabled link extraction.
+
+```python
+print(result.links["internal"][:3])  # Show first 3 internal links
+```
+
+### 5.2 `media`
+
+Similarly, a dictionary with `"images"`, `"videos"`, `"audios"`, etc. Each item could include `src`, `alt`, `score`, and more, if your crawler is set to gather them.
+
+```python
+images = result.media.get("images", [])
+for img in images:
+    print("Image URL:", img["src"], "Alt:", img.get("alt"))
+```
+
+### 5.3 `screenshot`, `pdf`, and `mhtml`
+
+If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then:
+
+- `result.screenshot` contains a base64-encoded PNG string.
+- `result.pdf` contains raw PDF bytes (you can write them to a file).
+- `result.mhtml` contains the MHTML snapshot of the page as a string (you can write it to a .mhtml file).
+
+```python
+# Save the PDF (only populated when pdf=True was set)
+if result.pdf:
+    with open("page.pdf", "wb") as f:
+        f.write(result.pdf)
+
+# Save the MHTML
+if result.mhtml:
+    with open("page.mhtml", "w", encoding="utf-8") as f:
+        f.write(result.mhtml)
+```
+
+The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing.
+
+### 5.4 `ssl_certificate`
+
+If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site’s SSL cert, such as issuer, validity dates, etc.
+
+---
+
+## 6. Accessing These Fields
+
+After you run:
+
+```python
+result = await crawler.arun(url="https://example.com", config=some_config)
+```
+
+Check any field:
+
+```python
+if result.success:
+    print(result.status_code, result.response_headers)
+    print("Links found:", len(result.links.get("internal", [])))
+    if result.markdown:
+        print("Markdown snippet:", result.markdown.raw_markdown[:200])
+    if result.extracted_content:
+        print("Structured JSON:", result.extracted_content)
+else:
+    print("Error:", result.error_message)
+```
+
+**Deprecation**: Since v0.5, `result.markdown_v2`, `result.fit_html`, and `result.fit_markdown` are deprecated. Use `result.markdown` instead! It holds a `MarkdownGenerationResult`, which includes `fit_html` and `fit_markdown` as its properties.
+
+
+---
+
+## 7. Next Steps
+
+- **Markdown Generation**: Dive deeper into how to configure `DefaultMarkdownGenerator` and various filters.  
+- **Content Filtering**: Learn how to use `BM25ContentFilter` and `PruningContentFilter`.
+- **Session & Hooks**: If you want to manipulate the page or preserve state across multiple `arun()` calls, see the hooking or session docs.  
+- **LLM Extraction**: For complex or unstructured content requiring AI-driven parsing, check the LLM-based strategies doc.
+
+**Enjoy** exploring all that `CrawlResult` offers—whether you need raw HTML, sanitized output, markdown, or fully structured data, Crawl4AI has you covered!
+```
+
+
+## File: docs/md_v2/core/deep-crawling.md
+
+```md
+# Deep Crawling
+
+One of Crawl4AI's most powerful features is its ability to perform **configurable deep crawling** that can explore websites beyond a single page. With fine-tuned control over crawl depth, domain boundaries, and content filtering, Crawl4AI gives you the tools to extract precisely the content you need.
+
+In this tutorial, you'll learn:
+
+1. How to set up a **Basic Deep Crawler** with BFS strategy  
+2. Understanding the difference between **streamed and non-streamed** output  
+3. Implementing **filters and scorers** to target specific content  
+4. Creating **advanced filtering chains** for sophisticated crawls  
+5. Using **BestFirstCrawling** for intelligent exploration prioritization  
+
+> **Prerequisites**  
+> - You’ve completed or read [AsyncWebCrawler Basics](../core/simple-crawling.md) to understand how to run a simple crawl.  
+> - You know how to configure `CrawlerRunConfig`.
+
+---
+
+## 1. Quick Example
+
+Here's a minimal code snippet that implements a basic deep crawl using the **BFSDeepCrawlStrategy**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+
+async def main():
+    # Configure a 2-level deep crawl
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BFSDeepCrawlStrategy(
+            max_depth=2, 
+            include_external=False
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        verbose=True
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun("https://example.com", config=config)
+        
+        print(f"Crawled {len(results)} pages in total")
+        
+        # Access individual results
+        for result in results[:3]:  # Show first 3 results
+            print(f"URL: {result.url}")
+            print(f"Depth: {result.metadata.get('depth', 0)}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**What's happening?**  
+- `BFSDeepCrawlStrategy(max_depth=2, include_external=False)` instructs Crawl4AI to:
+  - Crawl the starting page (depth 0) plus 2 more levels
+  - Stay within the same domain (don't follow external links)
+- Each result contains metadata like the crawl depth
+- Results are returned as a list after all crawling is complete
+
+---
+
+## 2. Understanding Deep Crawling Strategy Options
+
+### 2.1 BFSDeepCrawlStrategy (Breadth-First Search)
+
+The **BFSDeepCrawlStrategy** uses a breadth-first approach, exploring all links at one depth before moving deeper:
+
+```python
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+
+# Basic configuration
+strategy = BFSDeepCrawlStrategy(
+    max_depth=2,               # Crawl initial page + 2 levels deep
+    include_external=False,    # Stay within the same domain
+    max_pages=50,              # Maximum number of pages to crawl (optional)
+    score_threshold=0.3,       # Minimum score for URLs to be crawled (optional)
+)
+```
+
+**Key parameters:**
+- **`max_depth`**: Number of levels to crawl beyond the starting page
+- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
+- **`filter_chain`**: FilterChain instance for URL filtering
+- **`url_scorer`**: Scorer instance for evaluating URLs
+
+### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
+
+The **DFSDeepCrawlStrategy** uses a depth-first approach, exploring as far down a branch as possible before backtracking:
+
+```python
+from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+
+# Basic configuration
+strategy = DFSDeepCrawlStrategy(
+    max_depth=2,               # Crawl initial page + 2 levels deep
+    include_external=False,    # Stay within the same domain
+    max_pages=30,              # Maximum number of pages to crawl (optional)
+    score_threshold=0.5,       # Minimum score for URLs to be crawled (optional)
+)
+```
+
+**Key parameters:**
+- **`max_depth`**: Number of levels to crawl beyond the starting page
+- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
+- **`filter_chain`**: FilterChain instance for URL filtering
+- **`url_scorer`**: Scorer instance for evaluating URLs
+
+### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)
+
+For more intelligent crawling, use **BestFirstCrawlingStrategy** with scorers to prioritize the most relevant pages:
+
+```python
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+# Create a scorer
+scorer = KeywordRelevanceScorer(
+    keywords=["crawl", "example", "async", "configuration"],
+    weight=0.7
+)
+
+# Configure the strategy
+strategy = BestFirstCrawlingStrategy(
+    max_depth=2,
+    include_external=False,
+    url_scorer=scorer,
+    max_pages=25,              # Maximum number of pages to crawl (optional)
+)
+```
+
+This crawling approach:
+- Evaluates each discovered URL based on scorer criteria
+- Visits higher-scoring pages first
+- Helps focus crawl resources on the most relevant content
+- Can limit total pages crawled with `max_pages`
+- Does not need `score_threshold` as it naturally prioritizes by score
+
+---
+
+## 3. Streaming vs. Non-Streaming Results
+
+Crawl4AI can return results in two modes:
+
+### 3.1 Non-Streaming Mode (Default)
+
+```python
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
+    stream=False  # Default behavior
+)
+
+async with AsyncWebCrawler() as crawler:
+    # Wait for ALL results to be collected before returning
+    results = await crawler.arun("https://example.com", config=config)
+    
+    for result in results:
+        process_result(result)
+```
+
+**When to use non-streaming mode:**
+- You need the complete dataset before processing
+- You're performing batch operations on all results together
+- Crawl time isn't a critical factor
+
+### 3.2 Streaming Mode
+
+```python
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
+    stream=True  # Enable streaming
+)
+
+async with AsyncWebCrawler() as crawler:
+    # Returns an async iterator
+    async for result in await crawler.arun("https://example.com", config=config):
+        # Process each result as it becomes available
+        process_result(result)
+```
+
+**Benefits of streaming mode:**
+- Process results immediately as they're discovered
+- Start working with early results while crawling continues
+- Better for real-time applications or progressive display
+- Reduces memory pressure when handling many pages
+
+---
+
+## 4. Filtering Content with Filter Chains
+
+Filters help you narrow down which pages to crawl. Combine multiple filters using **FilterChain** for powerful targeting.
+
+### 4.1 Basic URL Pattern Filter
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
+
+# Only follow URLs containing "blog" or "docs"
+url_filter = URLPatternFilter(patterns=["*blog*", "*docs*"])
+
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(
+        max_depth=1,
+        filter_chain=FilterChain([url_filter])
+    )
+)
+```
+
+### 4.2 Combining Multiple Filters
+
+```python
+from crawl4ai.deep_crawling.filters import (
+    FilterChain,
+    URLPatternFilter,
+    DomainFilter,
+    ContentTypeFilter
+)
+
+# Create a chain of filters
+filter_chain = FilterChain([
+    # Only follow URLs with specific patterns
+    URLPatternFilter(patterns=["*guide*", "*tutorial*"]),
+    
+    # Only crawl specific domains
+    DomainFilter(
+        allowed_domains=["docs.example.com"],
+        blocked_domains=["old.docs.example.com"]
+    ),
+    
+    # Only include specific content types
+    ContentTypeFilter(allowed_types=["text/html"])
+])
+
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(
+        max_depth=2,
+        filter_chain=filter_chain
+    )
+)
+```
+
+### 4.3 Available Filter Types
+
+Crawl4AI includes several specialized filters:
+
+- **`URLPatternFilter`**: Matches URL patterns using wildcard syntax
+- **`DomainFilter`**: Controls which domains to include or exclude
+- **`ContentTypeFilter`**: Filters based on HTTP Content-Type
+- **`ContentRelevanceFilter`**: Uses similarity to a text query
+- **`SEOFilter`**: Evaluates SEO elements (meta tags, headers, etc.)
+
+---
+
+## 5. Using Scorers for Prioritized Crawling
+
+Scorers assign priority values to discovered URLs, helping the crawler focus on the most relevant content first.
+
+### 5.1 KeywordRelevanceScorer
+
+```python
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+
+# Create a keyword relevance scorer
+keyword_scorer = KeywordRelevanceScorer(
+    keywords=["crawl", "example", "async", "configuration"],
+    weight=0.7  # Importance of this scorer (0.0 to 1.0)
+)
+
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BestFirstCrawlingStrategy(
+        max_depth=2,
+        url_scorer=keyword_scorer
+    ),
+    stream=True  # Recommended with BestFirstCrawling
+)
+
+# Results will come in order of relevance score
+async with AsyncWebCrawler() as crawler:
+    async for result in await crawler.arun("https://example.com", config=config):
+        score = result.metadata.get("score", 0)
+        print(f"Score: {score:.2f} | {result.url}")
+```
+
+**How scorers work:**
+- Evaluate each discovered URL before crawling
+- Calculate relevance based on various signals
+- Help the crawler make intelligent choices about traversal order
+
+---
+
+## 6. Advanced Filtering Techniques
+
+### 6.1 SEO Filter for Quality Assessment
+
+The **SEOFilter** helps you identify pages with strong SEO characteristics:
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, SEOFilter
+
+# Create an SEO filter that looks for specific keywords in page metadata
+seo_filter = SEOFilter(
+    threshold=0.5,  # Minimum score (0.0 to 1.0)
+    keywords=["tutorial", "guide", "documentation"]
+)
+
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(
+        max_depth=1,
+        filter_chain=FilterChain([seo_filter])
+    )
+)
+```
+
+### 6.2 Content Relevance Filter
+
+The **ContentRelevanceFilter** analyzes the actual content of pages:
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, ContentRelevanceFilter
+
+# Create a content relevance filter
+relevance_filter = ContentRelevanceFilter(
+    query="Web crawling and data extraction with Python",
+    threshold=0.7  # Minimum similarity score (0.0 to 1.0)
+)
+
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(
+        max_depth=1,
+        filter_chain=FilterChain([relevance_filter])
+    )
+)
+```
+
+This filter:
+- Measures semantic similarity between query and page content
+- Is a BM25-based relevance filter that uses head section content
+
+---
+
+## 7. Building a Complete Advanced Crawler
+
+This example combines multiple techniques for a sophisticated crawl:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import (
+    FilterChain,
+    DomainFilter,
+    URLPatternFilter,
+    ContentTypeFilter
+)
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+async def run_advanced_crawler():
+    # Create a sophisticated filter chain
+    filter_chain = FilterChain([
+        # Domain boundaries
+        DomainFilter(
+            allowed_domains=["docs.example.com"],
+            blocked_domains=["old.docs.example.com"]
+        ),
+        
+        # URL patterns to include
+        URLPatternFilter(patterns=["*guide*", "*tutorial*", "*blog*"]),
+        
+        # Content type filtering
+        ContentTypeFilter(allowed_types=["text/html"])
+    ])
+
+    # Create a relevance scorer
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["crawl", "example", "async", "configuration"],
+        weight=0.7
+    )
+
+    # Set up the configuration
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BestFirstCrawlingStrategy(
+            max_depth=2,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=keyword_scorer
+        ),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        stream=True,
+        verbose=True
+    )
+
+    # Execute the crawl
+    results = []
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun("https://docs.example.com", config=config):
+            results.append(result)
+            score = result.metadata.get("score", 0)
+            depth = result.metadata.get("depth", 0)
+            print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+    # Analyze the results
+    print(f"Crawled {len(results)} high-value pages")
+    if results:  # Avoid dividing by zero when nothing was crawled
+        print(f"Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}")
+
+    # Group by depth
+    depth_counts = {}
+    for result in results:
+        depth = result.metadata.get("depth", 0)
+        depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+    print("Pages crawled by depth:")
+    for depth, count in sorted(depth_counts.items()):
+        print(f"  Depth {depth}: {count} pages")
+
+if __name__ == "__main__":
+    asyncio.run(run_advanced_crawler())
+```
+
+---
+
+
+## 8. Limiting and Controlling Crawl Size
+
+### 8.1 Using max_pages
+
+You can limit the total number of pages crawled with the `max_pages` parameter:
+
+```python
+# Limit the crawl to at most 20 pages, regardless of depth
+strategy = BFSDeepCrawlStrategy(
+    max_depth=3,
+    max_pages=20
+)
+```
+
+This feature is useful for:
+- Controlling API costs
+- Setting predictable execution times
+- Focusing on the most important content
+- Testing crawl configurations before full execution
+
+### 8.2 Using score_threshold
+
+For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:
+
+```python
+# Only follow links with scores above 0.4
+strategy = DFSDeepCrawlStrategy(
+    max_depth=2,
+    url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
+    score_threshold=0.4  # Skip URLs with scores below this value
+)
+```
+
+Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pages are already processed in order of highest score first.
+
+## 9. Common Pitfalls & Tips
+
+1. **Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.
+
+2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
+
+3. **Be a good web citizen.** Respect robots.txt (this check is disabled by default).
+
+4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.status` when processing results.
+
+5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
+
+---
+
+## 10. Summary & Next Steps
+
+In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
+
+- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
+- Process results in streaming or non-streaming mode
+- Apply filters to target specific content
+- Use scorers to prioritize the most relevant pages
+- Limit crawls with `max_pages` and `score_threshold` parameters
+- Build a complete advanced crawler with combined techniques
+
+With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.
+
+```
+
+
+## File: docs/md_v2/core/docker-deployment.md
+
+```md
+# Crawl4AI Docker Guide 🐳
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+  - [Local Build](#local-build)
+  - [Docker Hub](#docker-hub)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+  - [Understanding Request Schema](#understanding-request-schema)
+  - [REST API Examples](#rest-api-examples)
+  - [Python SDK](#python-sdk)
+- [Metrics & Monitoring](#metrics--monitoring)
+- [Deployment Scenarios](#deployment-scenarios)
+- [Complete Examples](#complete-examples)
+- [Getting Help](#getting-help)
+
+## Prerequisites
+
+Before we dive in, make sure you have:
+- Docker installed and running (version 20.10.0 or higher)
+- At least 4GB of RAM available for the container
+- Python 3.10+ (if using the Python SDK)
+- Node.js 16+ (if using the Node.js examples)
+
+> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
+
+## Installation
+
+### Local Build
+
+Let's get your local environment set up step by step!
+
+#### 1. Building the Image
+
+First, clone the repository and build the Docker image:
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai/deploy
+
+# Build the Docker image
+docker build --platform=linux/amd64 --no-cache -t crawl4ai .
+
+# Or build for arm64
+docker build --platform=linux/arm64 --no-cache -t crawl4ai .
+```
+
+#### 2. Environment Setup
+
+If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file:
+
+```env
+# OpenAI
+OPENAI_API_KEY=sk-your-key
+
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-key
+
+# DeepSeek
+DEEPSEEK_API_KEY=your-deepseek-key
+
+# Check out https://docs.litellm.ai/docs/providers for more providers!
+```
+
+> 🔑 **Note**: Keep your API keys secure! Never commit them to version control.
+
+#### 3. Running the Container
+
+You have several options for running the container:
+
+Basic run (no LLM support):
+```bash
+docker run -d -p 8000:8000 --name crawl4ai crawl4ai
+```
+
+With LLM support:
+```bash
+docker run -d -p 8000:8000 \
+  --env-file .llm.env \
+  --name crawl4ai \
+  crawl4ai
+```
+
+Using host environment variables (Not a good practice, but works for local testing):
+```bash
+docker run -d -p 8000:8000 \
+  --env-file .llm.env \
+  --env "$(env)" \
+  --name crawl4ai \
+  crawl4ai
+```
+
+#### Multi-Platform Build
+For distributing your image across different architectures, use `buildx`:
+
+```bash
+# Set up buildx builder
+docker buildx create --use
+
+# Build for multiple platforms
+docker buildx build \
+  --platform linux/amd64,linux/arm64 \
+  -t crawl4ai \
+  --push \
+  .
+```
+
+> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry.
+
+#### Development Build
+For development, you might want to enable all features:
+
+```bash
+docker build -t crawl4ai \
+  --build-arg INSTALL_TYPE=all \
+  --build-arg PYTHON_VERSION=3.10 \
+  --build-arg ENABLE_GPU=true \
+  .
+```
+
+#### GPU-Enabled Build
+If you plan to use GPU acceleration:
+
+```bash
+docker build -t crawl4ai \
+  --build-arg ENABLE_GPU=true \
+  deploy/docker/
+```
+
+### Build Arguments Explained
+
+| Argument | Description | Default | Options |
+|----------|-------------|---------|----------|
+| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
+| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
+| ENABLE_GPU | GPU support | false | true, false |
+| APP_HOME | Install path | /app | any valid path |
+
+### Build Best Practices
+
+1. **Choose the Right Install Type**
+   - `default`: Basic installation, smallest image. To be honest, I use this most of the time.
+   - `all`: Full features, larger image (includes transformer and nltk, so make sure you really need them)
+
+2. **Platform Considerations**
+   - Let Docker auto-detect platform unless you need cross-compilation
+   - Use --platform for specific architecture requirements
+   - Consider buildx for multi-architecture distribution
+
+3. **Performance Optimization**
+   - The image automatically includes platform-specific optimizations
+   - AMD64 gets OpenMP optimizations
+   - ARM64 gets OpenBLAS optimizations
+
+### Docker Hub
+
+> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned!
+
+## Using the API
+
+In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by building the JSON payload yourself and sending it directly to the API endpoints, which I will explain in detail.
+
+### Python SDK
+
+The SDK makes things easier! Here's how to use it:
+
+```python
+import asyncio
+
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig
+
+async def main():
+    async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
+        # If JWT is enabled, you can authenticate like this: (more on this later)
+        # await client.authenticate("test@example.com")
+        
+        # Non-streaming crawl
+        results = await client.crawl(
+            ["https://example.com", "https://python.org"],
+            browser_config=BrowserConfig(headless=True),
+            crawler_config=CrawlerRunConfig()
+        )
+        print(f"Non-streaming results: {results}")
+        
+        # Streaming crawl
+        crawler_config = CrawlerRunConfig(stream=True)
+        async for result in await client.crawl(
+            ["https://example.com", "https://python.org"],
+            browser_config=BrowserConfig(headless=True),
+            crawler_config=crawler_config
+        ):
+            print(f"Streamed result: {result}")
+        
+        # Get schema
+        schema = await client.get_schema()
+        print(f"Schema: {schema}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
+
+- `base_url` (str): Base URL of the Crawl4AI Docker server
+- `timeout` (float): Default timeout for requests in seconds
+- `verify_ssl` (bool): Whether to verify SSL certificates
+- `verbose` (bool): Whether to show logging output
+- `log_file` (str, optional): Path to log file if file logging is desired
+
+This client SDK generates a properly structured JSON request for the server's HTTP API.
+
+## Second Approach: Direct API Calls
+
+This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
+
+### Understanding Configuration Structure
+
+Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity.
+
+#### The Basic Pattern
+
+Try this in Python to understand the structure:
+```python
+from crawl4ai import BrowserConfig
+
+# Create a config and see its structure
+config = BrowserConfig(headless=True)
+print(config.dump())
+```
+
+This outputs:
+```json
+{
+    "type": "BrowserConfig",
+    "params": {
+        "headless": true
+    }
+}
+```
+
+#### Simple vs Complex Values
+
+The structure follows these rules:
+- Simple values (strings, numbers, booleans, lists) are passed directly
+- Complex values use a typed wrapper: class instances use `type` + `params`, while dictionaries use `{"type": "dict", "value": {...}}`
+
+For example, with dictionaries:
+```json
+{
+    "browser_config": {
+        "type": "BrowserConfig",
+        "params": {
+            "headless": true,           // Simple boolean - direct value
+            "viewport": {               // Complex dictionary - needs type-params
+                "type": "dict",
+                "value": {
+                    "width": 1200,
+                    "height": 800
+                }
+            }
+        }
+    }
+}
+```
+
+#### Strategy Pattern and Nesting
+
+Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration:
+
+```json
+{
+    "crawler_config": {
+        "type": "CrawlerRunConfig",
+        "params": {
+            "chunking_strategy": {
+                "type": "RegexChunking",      // Strategy implementation
+                "params": {
+                    "patterns": ["\n\n", "\\.\\s+"]
+                }
+            }
+        }
+    }
+}
+```
+
+Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy.
+
+#### Complex Nested Example
+
+Let's look at a more complex example with content filtering:
+
+```json
+{
+    "crawler_config": {
+        "type": "CrawlerRunConfig",
+        "params": {
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "params": {
+                    "content_filter": {
+                        "type": "PruningContentFilter",
+                        "params": {
+                            "threshold": 0.48,
+                            "threshold_type": "fixed"
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+This shows how deeply configurations can nest while maintaining a consistent structure.
+
+#### Quick Grammar Overview
+```
+config := {
+    "type": string,
+    "params": {
+        key: simple_value | complex_value
+    }
+}
+
+simple_value := string | number | boolean | [simple_value]
+complex_value := config | dict_value
+
+dict_value := {
+    "type": "dict",
+    "value": object
+}
+```
+
+#### Important Rules 🚨
+
+- Always use the type-params pattern for class instances
+- Use direct values for primitives (numbers, strings, booleans)
+- Wrap dictionaries with {"type": "dict", "value": {...}}
+- Arrays/lists are passed directly without type-params
+- All parameters are optional unless specifically required
+
+#### Pro Tip 💡
+
+The easiest way to get the correct structure is to:
+1. Create configuration objects in Python
+2. Use the `dump()` method to see their JSON representation
+3. Use that JSON in your API calls
+
+Example:
+```python
+from crawl4ai import CacheMode, CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter
+
+config = CrawlerRunConfig(
+    markdown_generator=DefaultMarkdownGenerator(
+        content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed")
+    ),
+    cache_mode=CacheMode.BYPASS
+)
+print(config.dump())  # Use this JSON in your API calls
+```
+
+
+#### More Examples
+
+**Advanced Crawler Configuration**
+
+```json
+{
+    "urls": ["https://example.com"],
+    "crawler_config": {
+        "type": "CrawlerRunConfig",
+        "params": {
+            "cache_mode": "bypass",
+            "markdown_generator": {
+                "type": "DefaultMarkdownGenerator",
+                "params": {
+                    "content_filter": {
+                        "type": "PruningContentFilter",
+                        "params": {
+                            "threshold": 0.48,
+                            "threshold_type": "fixed",
+                            "min_word_threshold": 0
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+**Extraction Strategy**:
+
+```json
+{
+    "crawler_config": {
+        "type": "CrawlerRunConfig",
+        "params": {
+            "extraction_strategy": {
+                "type": "JsonCssExtractionStrategy",
+                "params": {
+                    "schema": {
+                        "baseSelector": "article.post",
+                        "fields": [
+                            {"name": "title", "selector": "h1", "type": "text"},
+                            {"name": "content", "selector": ".content", "type": "html"}
+                        ]
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+**LLM Extraction Strategy**
+
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "extraction_strategy": {
+        "type": "LLMExtractionStrategy",
+        "params": {
+          "instruction": "Extract article title, author, publication date and main content",
+          "provider": "openai/gpt-4",
+          "api_token": "your-api-token",
+          "schema": {
+            "type": "dict",
+            "value": {
+              "title": "Article Schema",
+              "type": "object",
+              "properties": {
+                "title": {
+                  "type": "string",
+                  "description": "The article's headline"
+                },
+                "author": {
+                  "type": "string",
+                  "description": "The author's name"
+                },
+                "published_date": {
+                  "type": "string",
+                  "format": "date-time",
+                  "description": "Publication date and time"
+                },
+                "content": {
+                  "type": "string",
+                  "description": "The main article content"
+                }
+              },
+              "required": ["title", "content"]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+**Deep Crawler Example**
+
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "deep_crawl_strategy": {
+        "type": "BFSDeepCrawlStrategy",
+        "params": {
+          "max_depth": 3,
+          "filter_chain": {
+            "type": "FilterChain",
+            "params": {
+              "filters": [
+                {
+                  "type": "ContentTypeFilter",
+                  "params": {
+                    "allowed_types": ["text/html", "application/xhtml+xml"]
+                  }
+                },
+                {
+                  "type": "DomainFilter",
+                  "params": {
+                    "allowed_domains": ["blog.*", "docs.*"]
+                  }
+                }
+              ]
+            }
+          },
+          "url_scorer": {
+            "type": "CompositeScorer",
+            "params": {
+              "scorers": [
+                {
+                  "type": "KeywordRelevanceScorer",
+                  "params": {
+                    "keywords": ["tutorial", "guide", "documentation"]
+                  }
+                },
+                {
+                  "type": "PathDepthScorer",
+                  "params": {
+                    "weight": 0.5,
+                    "optimal_depth": 3  
+                  }
+                }
+              ]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+### REST API Examples
+
+Let's look at some practical examples:
+
+#### Simple Crawl
+
+```python
+import requests
+
+crawl_payload = {
+    "urls": ["https://example.com"],
+    "browser_config": {"headless": True},
+    "crawler_config": {"stream": False}
+}
+response = requests.post(
+    "http://localhost:8000/crawl",
+    # headers={"Authorization": f"Bearer {token}"},  # If JWT is enabled, more on this later
+    json=crawl_payload
+)
+print(response.json())  # Print the response for debugging
+```
+
+#### Streaming Results
+
+```python
+import json
+
+
+async def test_stream_crawl(session, token: str):
+    """Test the /crawl/stream endpoint with multiple URLs."""
+    url = "http://localhost:8000/crawl/stream"
+    payload = {
+        "urls": [
+            "https://example.com",
+            "https://example.com/page1",  
+            "https://example.com/page2",  
+            "https://example.com/page3",  
+        ],
+        "browser_config": {"headless": True, "viewport": {"width": 1200}},
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
+    }
+
+    headers = {}  # If JWT is enabled, use {"Authorization": f"Bearer {token}"} (more on this later)
+    
+    try:
+        async with session.post(url, json=payload, headers=headers) as response:
+            status = response.status
+            print(f"Status: {status} (Expected: 200)")
+            assert status == 200, f"Expected 200, got {status}"
+            
+            # Read streaming response line-by-line (NDJSON)
+            async for line in response.content:
+                if line:
+                    data = json.loads(line.decode('utf-8').strip())
+                    print(f"Streamed Result: {json.dumps(data, indent=2)}")
+    except Exception as e:
+        print(f"Error in streaming crawl test: {str(e)}")
+```
+
+## Metrics & Monitoring
+
+Keep an eye on your crawler with these endpoints:
+
+- `/health` - Quick health check
+- `/metrics` - Detailed Prometheus metrics
+- `/schema` - Full API schema
+
+Example health check:
+```bash
+curl http://localhost:8000/health
+```
+
+## Deployment Scenarios
+
+> 🚧 Coming soon! We'll cover:
+> - Kubernetes deployment
+> - Cloud provider setups (AWS, GCP, Azure)
+> - High-availability configurations
+> - Load balancing strategies
+
+## Complete Examples
+
+Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
+
+- [Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py)
+- [Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py)
+
+## Server Configuration
+
+The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security.
+
+### Understanding config.yml
+
+The configuration file is located at `deploy/docker/config.yml`. You can either modify this file before building the image or mount a custom configuration when running the container.
+
+Here's a detailed breakdown of the configuration options:
+
+```yaml
+# Application Configuration
+app:
+  title: "Crawl4AI API"           # Server title in OpenAPI docs
+  version: "1.0.0"               # API version
+  host: "0.0.0.0"               # Listen on all interfaces
+  port: 8000                    # Server port
+  reload: True                  # Enable hot reloading (development only)
+  timeout_keep_alive: 300       # Keep-alive timeout in seconds
+
+# Rate Limiting Configuration
+rate_limiting:
+  enabled: True                 # Enable/disable rate limiting
+  default_limit: "100/minute"   # Rate limit format: "number/timeunit"
+  trusted_proxies: []          # List of trusted proxy IPs
+  storage_uri: "memory://"     # Use "redis://localhost:6379" for production
+
+# Security Configuration
+security:
+  enabled: false               # Master toggle for security features
+  jwt_enabled: true            # Enable JWT authentication
+  https_redirect: True         # Force HTTPS
+  trusted_hosts: ["*"]         # Allowed hosts (use specific domains in production)
+  headers:                     # Security headers
+    x_content_type_options: "nosniff"
+    x_frame_options: "DENY"
+    content_security_policy: "default-src 'self'"
+    strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+  memory_threshold_percent: 95.0  # Memory usage threshold
+  rate_limiter:
+    base_delay: [1.0, 2.0]      # Min and max delay between requests
+  timeouts:
+    stream_init: 30.0           # Stream initialization timeout
+    batch_process: 300.0        # Batch processing timeout
+
+# Logging Configuration
+logging:
+  level: "INFO"                 # Log level (DEBUG, INFO, WARNING, ERROR)
+  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+  prometheus:
+    enabled: True              # Enable Prometheus metrics
+    endpoint: "/metrics"       # Metrics endpoint
+  health_check:
+    endpoint: "/health"        # Health check endpoint
+```
+
+### JWT Authentication
+
+When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works:
+
+#### Getting a Token
+```http
+POST /token
+Content-Type: application/json
+
+{
+    "email": "user@example.com"
+}
+```
+
+The endpoint returns:
+```json
+{
+    "email": "user@example.com",
+    "access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...",
+    "token_type": "bearer"
+}
+```
+
+#### Using the Token
+Add the token to your requests:
+```bash
+curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl
+```
+
+Using the Python SDK:
+```python
+from crawl4ai.docker_client import Crawl4aiDockerClient
+
+async with Crawl4aiDockerClient() as client:
+    # Authenticate first
+    await client.authenticate("user@example.com")
+    
+    # Now all requests will include the token automatically
+    result = await client.crawl(urls=["https://example.com"])
+```
+
+#### Production Considerations 💡
+The default implementation uses simple email verification. For production use, consider:
+- Email verification via OTP/magic links
+- OAuth2 integration
+- Rate limiting token generation
+- Token expiration and refresh mechanisms
+- IP-based restrictions
+
+### Configuration Tips and Best Practices
+
+1. **Production Settings** 🏭
+
+   ```yaml
+   app:
+     reload: False              # Disable reload in production
+     timeout_keep_alive: 120    # Lower timeout for better resource management
+   
+   rate_limiting:
+     storage_uri: "redis://redis:6379"  # Use Redis for distributed rate limiting
+     default_limit: "50/minute"         # More conservative rate limit
+   
+   security:
+     enabled: true                      # Enable all security features
+     trusted_hosts: ["your-domain.com"] # Restrict to your domain
+   ```
+
+2. **Development Settings** 🛠️
+
+   ```yaml
+   app:
+     reload: True               # Enable hot reloading
+     timeout_keep_alive: 300    # Longer timeout for debugging
+   
+   logging:
+     level: "DEBUG"            # More verbose logging
+   ```
+
+3. **High-Traffic Settings** 🚦
+
+   ```yaml
+   crawler:
+     memory_threshold_percent: 85.0  # More conservative memory limit
+     rate_limiter:
+       base_delay: [2.0, 4.0]       # More aggressive rate limiting
+   ```
+
+### Customizing Your Configuration
+
+#### Method 1: Pre-build Configuration
+
+```bash
+# Copy and modify config before building
+cd crawl4ai/deploy
+vim custom-config.yml # Or use any editor
+
+# Build with custom config
+docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest .
+```
+
+#### Method 2: Build-time Configuration
+
+Use a custom config during build:
+
+```bash
+# Build with custom config
+docker build --platform=linux/amd64 --no-cache \
+  --build-arg CONFIG_PATH=/path/to/custom-config.yml \
+  -t crawl4ai:latest .
+```
+
+#### Method 3: Runtime Configuration
+```bash
+# Mount custom config at runtime
+docker run -d -p 8000:8000 \
+  -v $(pwd)/custom-config.yml:/app/config.yml \
+  crawl4ai-server:prod
+```
+
+> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to the deploy directory.
+>
+> 💡 Note: When using Method 3, ensure your custom config file has all required fields, as the container will use this file instead of the built-in config.
+
+### Configuration Recommendations
+
+1. **Security First** 🔒
+   - Always enable security in production
+   - Use specific trusted_hosts instead of wildcards
+   - Set up proper rate limiting to protect your server
+   - Consider your environment before enabling HTTPS redirect
+
+2. **Resource Management** 💻
+   - Adjust memory_threshold_percent based on available RAM
+   - Set timeouts according to your content size and network conditions
+   - Use Redis for rate limiting in multi-container setups
+
+3. **Monitoring** 📊
+   - Enable Prometheus if you need metrics
+   - Set DEBUG logging in development, INFO in production
+   - Regular health check monitoring is crucial
+
+4. **Performance Tuning** ⚡
+   - Start with conservative rate limiter delays
+   - Increase batch_process timeout for large content
+   - Adjust stream_init timeout based on initial response times
+
+## Getting Help
+
+We're here to help you succeed with Crawl4AI! Here's how to get support:
+
+- 📖 Check our [full documentation](https://docs.crawl4ai.com)
+- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
+- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
+- ⭐ Star us on GitHub to show support!
+
+## Summary
+
+In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
+- Building and running the Docker container
+- Configuring the environment
+- Making API requests with proper typing
+- Using the Python SDK
+- Monitoring your deployment
+
+Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+
+Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
+
+Happy crawling! 🕷️
+```
+
+
+## File: docs/md_v2/core/fit-markdown.md
+
+```md
+# Fit Markdown with Pruning & BM25
+
+**Fit Markdown** is a specialized **filtered** version of your page’s markdown, focusing on the most relevant content. By default, Crawl4AI converts the entire HTML into a broad **raw_markdown**. With fit markdown, we apply a **content filter** algorithm (e.g., **Pruning** or **BM25**) to remove or rank low-value sections—such as repetitive sidebars, shallow text blocks, or irrelevancies—leaving a concise textual “core.”
+
+---
+
+## 1. How “Fit Markdown” Works
+
+### 1.1 The `content_filter`
+
+In **`CrawlerRunConfig`**, you can specify a **`content_filter`** to shape how content is pruned or ranked before final markdown generation. A filter’s logic is applied **before** or **during** the HTML→Markdown process, producing:
+
+- **`result.markdown.raw_markdown`** (unfiltered)
+- **`result.markdown.fit_markdown`** (filtered or “fit” version)
+- **`result.markdown.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`)
+
+
+### 1.2 Common Filters
+
+1. **PruningContentFilter** – Scores each node by text density, link density, and tag importance, discarding those below a threshold.  
+2. **BM25ContentFilter** – Focuses on textual relevance using BM25 ranking, especially useful if you have a specific user query (e.g., “machine learning” or “food nutrition”).
+
+---
+
+## 2. PruningContentFilter
+
+**Pruning** discards less relevant nodes based on **text density, link density, and tag importance**. It’s a heuristic-based approach—if certain sections appear too “thin” or too “spammy,” they’re pruned.
+
+### 2.1 Usage Example
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+    # Step 1: Create a pruning filter
+    prune_filter = PruningContentFilter(
+        # Lower → more content retained, higher → more content pruned
+        threshold=0.45,           
+        # "fixed" or "dynamic"
+        threshold_type="dynamic",  
+        # Ignore nodes with <5 words
+        min_word_threshold=5      
+    )
+
+    # Step 2: Insert it into a Markdown Generator
+    md_generator = DefaultMarkdownGenerator(content_filter=prune_filter)
+    
+    # Step 3: Pass it to CrawlerRunConfig
+    config = CrawlerRunConfig(
+        markdown_generator=md_generator
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com", 
+            config=config
+        )
+        
+        if result.success:
+            # 'fit_markdown' is your pruned content, focusing on "denser" text
+            print("Raw Markdown length:", len(result.markdown.raw_markdown))
+            print("Fit Markdown length:", len(result.markdown.fit_markdown))
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### 2.2 Key Parameters
+
+- **`min_word_threshold`** (int): If a block has fewer words than this, it’s pruned.  
+- **`threshold_type`** (str):
+  - `"fixed"` → each node must exceed `threshold` (0–1).  
+  - `"dynamic"` → node scoring adjusts according to tag type, text/link density, etc.  
+- **`threshold`** (float, default ~0.48): The base or “anchor” cutoff.  
+
+**Algorithmic Factors**:
+
+- **Text density** – Encourages blocks that have a higher ratio of text to overall content.  
+- **Link density** – Penalizes sections that are mostly links.  
+- **Tag importance** – e.g., an `<article>` or `<p>` might be more important than a `<div>`.  
+- **Structural context** – If a node is deeply nested or in a suspected sidebar, it might be deprioritized.
+
+---
+
+## 3. BM25ContentFilter
+
+**BM25** is a classical text ranking algorithm often used in search engines. If you have a **user query** or rely on page metadata to derive a query, BM25 can identify which text chunks best match that query.
+
+### 3.1 Usage Example
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+    # 1) A BM25 filter with a user query
+    bm25_filter = BM25ContentFilter(
+        user_query="startup fundraising tips",
+        # Adjust for stricter or looser results
+        bm25_threshold=1.2  
+    )
+
+    # 2) Insert into a Markdown Generator
+    md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
+    
+    # 3) Pass to crawler config
+    config = CrawlerRunConfig(
+        markdown_generator=md_generator
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com", 
+            config=config
+        )
+        if result.success:
+            print("Fit Markdown (BM25 query-based):")
+            print(result.markdown.fit_markdown)
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### 3.2 Parameters
+
+- **`user_query`** (str, optional): E.g. `"machine learning"`. If blank, the filter tries to glean a query from page metadata.  
+- **`bm25_threshold`** (float, default 1.0):  
+  - Higher → fewer chunks but more relevant.  
+  - Lower → more inclusive.  
+
+> In more advanced scenarios, you might see parameters like `use_stemming`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted.
+
+---
+
+## 4. Accessing the “Fit” Output
+
+After the crawl, your “fit” content is found in **`result.markdown.fit_markdown`**. 
+
+```python
+fit_md = result.markdown.fit_markdown
+fit_html = result.markdown.fit_html
+```
+
+If the content filter is **BM25**, you might see additional logic or references in `fit_markdown` that highlight relevant segments. If it’s **Pruning**, the text is typically well-cleaned but not necessarily matched to a query.
+
+---
+
+## 5. Code Patterns Recap
+
+### 5.1 Pruning
+
+```python
+prune_filter = PruningContentFilter(
+    threshold=0.5,
+    threshold_type="fixed",
+    min_word_threshold=10
+)
+md_generator = DefaultMarkdownGenerator(content_filter=prune_filter)
+config = CrawlerRunConfig(markdown_generator=md_generator)
+```
+
+### 5.2 BM25
+
+```python
+bm25_filter = BM25ContentFilter(
+    user_query="health benefits fruit",
+    bm25_threshold=1.2
+)
+md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
+config = CrawlerRunConfig(markdown_generator=md_generator)
+```
+
+---
+
+## 6. Combining with “word_count_threshold” & Exclusions
+
+Remember you can also specify:
+
+```python
+config = CrawlerRunConfig(
+    word_count_threshold=10,
+    excluded_tags=["nav", "footer", "header"],
+    exclude_external_links=True,
+    markdown_generator=DefaultMarkdownGenerator(
+        content_filter=PruningContentFilter(threshold=0.5)
+    )
+)
+```
+
+Thus, **multi-level** filtering occurs:
+
+1. The crawler’s `excluded_tags` are removed from the HTML first.  
+2. The content filter (Pruning, BM25, or custom) prunes or ranks the remaining text blocks.  
+3. The final “fit” content is generated in `result.markdown.fit_markdown`.
+
+---
+
+## 7. Custom Filters
+
+If you need a different approach (like a specialized ML model or site-specific heuristics), you can create a new class inheriting from `RelevantContentFilter` and implement `filter_content(html)`. Then inject it into your **markdown generator**:
+
+```python
+from crawl4ai.content_filter_strategy import RelevantContentFilter
+
+class MyCustomFilter(RelevantContentFilter):
+    def filter_content(self, html, min_word_threshold=None):
+        # parse HTML, implement custom logic
+        return [block for block in ... if ... some condition...]
+
+```
+
+**Steps**:
+
+1. Subclass `RelevantContentFilter`.  
+2. Implement `filter_content(...)`.  
+3. Use it in your `DefaultMarkdownGenerator(content_filter=MyCustomFilter(...))`.
+
+---
+
+## 8. Final Thoughts
+
+**Fit Markdown** is a crucial feature for:
+
+- **Summaries**: Quickly get the important text from a cluttered page.  
+- **Search**: Combine with **BM25** to produce content relevant to a query.  
+- **AI Pipelines**: Filter out boilerplate so LLM-based extraction or summarization runs on denser text.
+
+**Key Points**:
+- **PruningContentFilter**: Great if you just want the “meatiest” text without a user query.  
+- **BM25ContentFilter**: Perfect for query-based extraction or searching.  
+- Combine with **`excluded_tags`, `exclude_external_links`, `word_count_threshold`** to refine your final “fit” text.  
+- Fit markdown ends up in **`result.markdown.fit_markdown`**, with the matching HTML in **`result.markdown.fit_html`**.
+
+With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
+
+- Last Updated: 2025-01-01
+```
+
+
+## File: docs/md_v2/core/installation.md
+
+```md
+# Installation & Setup (2023 Edition)
+
+## 1. Basic Installation
+
+```bash
+pip install crawl4ai
+```
+
+This installs the **core** Crawl4AI library along with essential dependencies. **No** advanced features (like transformers or PyTorch) are included yet.
+
+## 2. Initial Setup & Diagnostics
+
+### 2.1 Run the Setup Command
+After installing, call:
+
+```bash
+crawl4ai-setup
+```
+
+**What does it do?**
+- Installs or updates required Playwright browsers (Chromium, Firefox, etc.)
+- Performs OS-level checks (e.g., missing libs on Linux)
+- Confirms your environment is ready to crawl
+
+### 2.2 Diagnostics
+Optionally, you can run **diagnostics** to confirm everything is functioning:
+
+```bash
+crawl4ai-doctor
+```
+
+This command attempts to:
+- Check Python version compatibility
+- Verify Playwright installation
+- Inspect environment variables or library conflicts
+
+If any issues arise, follow its suggestions (e.g., installing additional system packages) and re-run `crawl4ai-setup`.
+
+---
+
+## 3. Verifying Installation: A Simple Crawl (Skip this step if you have already run `crawl4ai-doctor`)
+
+Below is a minimal Python script demonstrating a **basic** crawl. It imports our new **`BrowserConfig`** and **`CrawlerRunConfig`** so you can see where they come from, though neither is instantiated and no custom settings are passed in this example:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+        )
+        print(result.markdown[:300])  # Show the first 300 characters of extracted text
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Expected** outcome:
+- A headless browser session loads `example.com`
+- Crawl4AI returns ~300 characters of markdown.  
+If errors occur, rerun `crawl4ai-doctor` or manually ensure Playwright is installed correctly.
+
+---
+
+## 4. Advanced Installation (Optional)
+
+**Warning**: Only install these **if you truly need them**. They bring in larger dependencies, including big models, which can increase disk usage and memory load significantly.
+
+### 4.1 Torch, Transformers, or All
+
+- **Text Clustering (Torch)**  
+  ```bash
+  pip install crawl4ai[torch]
+  crawl4ai-setup
+  ```
+  Installs PyTorch-based features (e.g., cosine similarity or advanced semantic chunking).
+
+- **Transformers**  
+  ```bash
+  pip install crawl4ai[transformer]
+  crawl4ai-setup
+  ```
+  Adds Hugging Face-based summarization or generation strategies.
+
+- **All Features**  
+  ```bash
+  pip install crawl4ai[all]
+  crawl4ai-setup
+  ```
+
+#### (Optional) Pre-Fetching Models
+```bash
+crawl4ai-download-models
+```
+This step caches large models locally (if needed). **Only do this** if your workflow requires them.
+
+---
+
+## 5. Docker (Experimental)
+
+We provide a **temporary** Docker approach for testing. **It’s not stable and may break** with future releases. We plan a major Docker revamp in a future stable version, 2025 Q1. If you still want to try:
+
+```bash
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+
+You can then make POST requests to `http://localhost:11235/crawl` to perform crawls. **Production usage** is discouraged until our new Docker approach is ready (planned in Jan or Feb 2025).
+
+---
+
+## 6. Local Server Mode (Legacy)
+
+Some older docs mention running Crawl4AI as a local server. This approach has been **partially replaced** by the new Docker-based prototype and upcoming stable server release. You can experiment, but expect major changes. Official local server instructions will arrive once the new Docker architecture is finalized.
+
+---
+
+## Summary
+
+1. **Install** with `pip install crawl4ai` and run `crawl4ai-setup`.
+2. **Diagnose** with `crawl4ai-doctor` if you see errors.
+3. **Verify** by crawling `example.com` with minimal `BrowserConfig` + `CrawlerRunConfig`.
+4. **Advanced** features (Torch, Transformers) are **optional**—avoid them if you don’t need them (they significantly increase resource usage).
+5. **Docker** is **experimental**—use at your own risk until the stable version is released.
+6. **Local server** references in older docs are largely deprecated; a new solution is in progress.
+
+**Got questions?** Check [GitHub issues](https://github.com/unclecode/crawl4ai/issues) for updates or ask the community!
+```
+
+
+## File: docs/md_v2/core/link-media.md
+
+```md
+# Link & Media 
+
+In this tutorial, you’ll learn how to:
+
+1. Extract links (internal, external) from crawled pages  
+2. Filter or exclude specific domains (e.g., social media or custom domains)  
+3. Access and manage media data (especially images) in the crawl result  
+4. Configure your crawler to exclude or prioritize certain images
+
+### 3.2 Excluding Images
+
+#### Excluding External Images
+
+If you're dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_external_images=True
+)
+```
+
+This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
+
+#### Excluding All Images
+
+If you want to completely remove all images from the page to maximize performance and reduce memory usage, use:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_all_images=True
+)
+```
+
+This setting removes all images very early in the processing pipeline, which significantly improves memory efficiency and processing speed. This is particularly useful when:
+- You don't need image data in your results
+- You're crawling image-heavy pages that cause memory issues
+- You want to focus only on text content
+- You need to maximize crawling speed
+
+> **Prerequisites**  
+> - You have completed or are familiar with the [AsyncWebCrawler Basics](../core/simple-crawling.md) tutorial.  
+> - You can run Crawl4AI in your environment (Playwright, Python, etc.).
+
+---
+
+The **Link Extraction** and **Media Extraction** sections below include example data structures showing how links and media items are stored in `CrawlResult`. Exact field names and descriptions may vary with your Crawl4AI version, so check them against your actual output.
+
+---
+
+## 1. Link Extraction
+
+### 1.1 `result.links`
+
+When you call `arun()` or `arun_many()` on a URL, Crawl4AI automatically extracts links and stores them in the `links` field of `CrawlResult`. By default, the crawler tries to distinguish **internal** links (same domain) from **external** links (different domains).
+
+**Basic Example**:
+
+```python
+from crawl4ai import AsyncWebCrawler
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun("https://www.example.com")
+    if result.success:
+        internal_links = result.links.get("internal", [])
+        external_links = result.links.get("external", [])
+        print(f"Found {len(internal_links)} internal links.")
+        print(f"Found {len(external_links)} external links.")
+        print(f"Found {len(result.media)} media items.")
+
+        # Each link is typically a dictionary with fields like:
+        # { "href": "...", "text": "...", "title": "...", "base_domain": "..." }
+        if internal_links:
+            print("Sample Internal Link:", internal_links[0])
+    else:
+        print("Crawl failed:", result.error_message)
+```
+
+**Structure Example**:
+
+```python
+result.links = {
+  "internal": [
+    {
+      "href": "https://kidocode.com/",
+      "text": "",
+      "title": "",
+      "base_domain": "kidocode.com"
+    },
+    {
+      "href": "https://kidocode.com/degrees/technology",
+      "text": "Technology Degree",
+      "title": "KidoCode Tech Program",
+      "base_domain": "kidocode.com"
+    },
+    # ...
+  ],
+  "external": [
+    # possibly other links leading to third-party sites
+  ]
+}
+```
+
+- **`href`**: The raw hyperlink URL.  
+- **`text`**: The link text (if any) within the `<a>` tag.  
+- **`title`**: The `title` attribute of the link (if present).  
+- **`base_domain`**: The domain extracted from `href`. Helpful for filtering or grouping by domain.
+
+---
+
+## 2. Domain Filtering
+
+Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are:
+
+- **`exclude_external_links`**: If `True`, discard any link pointing outside the root domain.  
+- **`exclude_social_media_domains`**: Provide a list of social media platforms (e.g., `["facebook.com", "twitter.com"]`) to exclude from your crawl.  
+- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms.  
+- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`).
+
+### 2.1 Example: Excluding External & Social Media Links
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    crawler_cfg = CrawlerRunConfig(
+        exclude_external_links=True,          # No links outside primary domain
+        exclude_social_media_links=True       # Skip recognized social media domains
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            "https://www.example.com",
+            config=crawler_cfg
+        )
+        if result.success:
+            print("[OK] Crawled:", result.url)
+            print("Internal links count:", len(result.links.get("internal", [])))
+            print("External links count:", len(result.links.get("external", [])))  
+            # Likely zero external links in this scenario
+        else:
+            print("[ERROR]", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### 2.2 Example: Excluding Specific Domains
+
+If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_domains=["suspiciousads.com"]
+)
+```
+
+This approach is handy when you still want external links but need to block certain sites you consider spammy.
+
+---
+
+## 3. Media Extraction
+
+### 3.1 Accessing `result.media`
+
+By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
+
+**Basic Example**:
+
+```python
+if result.success:
+    # Get images
+    images_info = result.media.get("images", [])
+    print(f"Found {len(images_info)} images in total.")
+    for i, img in enumerate(images_info[:3]):  # Inspect just the first 3
+        print(f"[Image {i}] URL: {img['src']}")
+        print(f"           Alt text: {img.get('alt', '')}")
+        print(f"           Score: {img.get('score')}")
+        print(f"           Description: {img.get('desc', '')}\n")
+    
+    # Get tables
+    tables = result.media.get("tables", [])
+    print(f"Found {len(tables)} data tables in total.")
+    for i, table in enumerate(tables):
+        print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
+        print(f"           Columns: {len(table.get('headers', []))}")
+        print(f"           Rows: {len(table.get('rows', []))}")
+```
+
+**Structure Example**:
+
+```python
+result.media = {
+  "images": [
+    {
+      "src": "https://cdn.prod.website-files.com/.../Group%2089.svg",
+      "alt": "coding school for kids",
+      "desc": "Trial Class Degrees degrees All Degrees AI Degree Technology ...",
+      "score": 3,
+      "type": "image",
+      "group_id": 0,
+      "format": None,
+      "width": None,
+      "height": None
+    },
+    # ...
+  ],
+  "videos": [
+    # Similar structure but with video-specific fields
+  ],
+  "audio": [
+    # Similar structure but with audio-specific fields
+  ],
+  "tables": [
+    {
+      "headers": ["Name", "Age", "Location"],
+      "rows": [
+        ["John Doe", "34", "New York"],
+        ["Jane Smith", "28", "San Francisco"],
+        ["Alex Johnson", "42", "Chicago"]
+      ],
+      "caption": "Employee Directory",
+      "summary": "Directory of company employees"
+    },
+    # More tables if present
+  ]
+}
+```
+
+Depending on your Crawl4AI version or scraping strategy, these dictionaries can include fields like:
+
+- **`src`**: The media URL (e.g., image source)  
+- **`alt`**: The alt text for images (if present)  
+- **`desc`**: A snippet of nearby text or a short description (optional)  
+- **`score`**: A heuristic relevance score if you’re using content-scoring features  
+- **`width`**, **`height`**: If the crawler detects dimensions for the image/video  
+- **`type`**: Usually `"image"`, `"video"`, or `"audio"`  
+- **`group_id`**: If you’re grouping related media items, the crawler might assign an ID  
+
+With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics.
+
+### 3.2 Excluding External Images
+
+If you’re dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_external_images=True
+)
+```
+
+This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling.
+
+### 3.3 Working with Tables
+
+Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
+
+- Presence of thead and tbody sections
+- Use of th elements for headers
+- Column consistency
+- Text density
+- And other factors
+
+Tables that score above the threshold (default: 7) are extracted and stored in `result.media["tables"]`.
+
+**Accessing Table Data**:
+
+```python
+if result.success:
+    tables = result.media.get("tables", [])
+    print(f"Found {len(tables)} data tables on the page")
+    
+    if tables:
+        # Access the first table
+        first_table = tables[0]
+        print(f"Table caption: {first_table.get('caption', 'No caption')}")
+        print(f"Headers: {first_table.get('headers', [])}")
+        
+        # Print the first 3 rows
+        for i, row in enumerate(first_table.get('rows', [])[:3]):
+            print(f"Row {i+1}: {row}")
+```
+
+**Configuring Table Extraction**:
+
+You can adjust the sensitivity of the table detection algorithm with:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    table_score_threshold=5  # Lower value = more tables detected (default: 7)
+)
+```
+
+Each extracted table contains:
+- `headers`: Column header names
+- `rows`: List of rows, each containing cell values
+- `caption`: Table caption text (if available)
+- `summary`: Table summary attribute (if specified)
+
+### 3.4 Additional Media Config
+
+- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.  
+- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.  
+- **`capture_mhtml`**: Set to `True` if you want an MHTML snapshot of the page in `result.mhtml`. This format preserves the entire web page with all its resources (CSS, images, scripts) in a single file, making it perfect for archiving or offline viewing.
+- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction.
+
+#### Example: Capturing Page as MHTML
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    crawler_cfg = CrawlerRunConfig(
+        capture_mhtml=True  # Enable MHTML capture
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=crawler_cfg)
+        
+        if result.success and result.mhtml:
+            # Save the MHTML snapshot to a file
+            with open("example.mhtml", "w", encoding="utf-8") as f:
+                f.write(result.mhtml)
+            print("MHTML snapshot saved to example.mhtml")
+        else:
+            print("Failed to capture MHTML:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+The MHTML format is particularly useful because:
+- It captures the complete page state including all resources
+- It can be opened in most modern browsers for offline viewing
+- It preserves the page exactly as it appeared during crawling
+- It's a single file, making it easy to store and transfer
+
+---
+
+## 4. Putting It All Together: Link & Media Filtering
+
+Here’s a combined example demonstrating how to filter out external links, skip certain domains, and exclude external images:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    # Suppose we want to keep only internal links, remove certain domains, 
+    # and discard external images from the final crawl data.
+    crawler_cfg = CrawlerRunConfig(
+        exclude_external_links=True,
+        exclude_domains=["spammyads.com"],
+        exclude_social_media_links=True,   # skip Twitter, Facebook, etc.
+        exclude_external_images=True,      # keep only images from main domain
+        wait_for_images=True,             # ensure images are loaded
+        verbose=True
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://www.example.com", config=crawler_cfg)
+
+        if result.success:
+            print("[OK] Crawled:", result.url)
+            
+            # 1. Links
+            in_links = result.links.get("internal", [])
+            ext_links = result.links.get("external", [])
+            print("Internal link count:", len(in_links))
+            print("External link count:", len(ext_links))  # should be zero with exclude_external_links=True
+            
+            # 2. Images
+            images = result.media.get("images", [])
+            print("Images found:", len(images))
+            
+            # Let's see a snippet of these images
+            for i, img in enumerate(images[:3]):
+                print(f"  - {img['src']} (alt={img.get('alt','')}, score={img.get('score','N/A')})")
+        else:
+            print("[ERROR] Failed to crawl. Reason:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 5. Common Pitfalls & Tips
+
+1. **Conflicting Flags**:  
+   - Setting both `exclude_external_links=True` and `exclude_social_media_links=True` is typically fine, but understand that the first setting already discards *all* external links, so the second becomes somewhat redundant.  
+   - Set `exclude_external_images=True` but want to keep some external images? There is currently no partial, domain-based setting for images, so you might need a custom approach or hook logic.
+
+2. **Relevancy Scores**:  
+   - If your version of Crawl4AI or your scraping strategy includes an `img["score"]`, it’s typically a heuristic based on size, position, or content analysis. Evaluate carefully if you rely on it.
+
+3. **Performance**:  
+   - Excluding certain domains or external images can speed up your crawl, especially for large, media-heavy pages.  
+   - If you want a “full” link map, do *not* exclude them. Instead, you can post-filter in your own code.
+
+4. **Social Media Lists**:  
+   - `exclude_social_media_links=True` typically references an internal list of known social domains like Facebook, Twitter, LinkedIn, etc. If you need to add or remove from that list, look for library settings or a local config file (depending on your version).
+
+---
+
+**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
+### Table Extraction Tips
+
+- Not all HTML tables are extracted; only those detected as "data tables" (as opposed to layout tables) are.
+- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
+- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
+
+The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.
+
+```
+
+
+## File: docs/md_v2/core/local-files.md
+
+```md
+# Prefix-Based Input Handling in Crawl4AI
+
+This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
+
+## Crawling a Web URL
+
+To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_web():
+    config = CrawlerRunConfig(bypass_cache=True)
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/apple", 
+            config=config
+        )
+        if result.success:
+            print("Markdown Content:")
+            print(result.markdown)
+        else:
+            print(f"Failed to crawl: {result.error_message}")
+
+asyncio.run(crawl_web())
+```
+
+## Crawling a Local HTML File
+
+To crawl a local HTML file, prefix the file path with `file://`.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_local_file():
+    local_file_path = "/path/to/apple.html"  # Replace with your file path
+    file_url = f"file://{local_file_path}"
+    config = CrawlerRunConfig(bypass_cache=True)
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=file_url, config=config)
+        if result.success:
+            print("Markdown Content from Local File:")
+            print(result.markdown)
+        else:
+            print(f"Failed to crawl local file: {result.error_message}")
+
+asyncio.run(crawl_local_file())
+```
+
+## Crawling Raw HTML Content
+
+To crawl raw HTML content, prefix the HTML string with `raw:`.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_raw_html():
+    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
+    raw_html_url = f"raw:{raw_html}"
+    config = CrawlerRunConfig(bypass_cache=True)
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=raw_html_url, config=config)
+        if result.success:
+            print("Markdown Content from Raw HTML:")
+            print(result.markdown)
+        else:
+            print(f"Failed to crawl raw HTML: {result.error_message}")
+
+asyncio.run(crawl_raw_html())
+```
+
+---
+
+# Complete Example
+
+Below is a comprehensive script that:
+
+1. Crawls the Wikipedia page for "Apple."
+2. Saves the HTML content to a local file (`apple.html`).
+3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
+4. Crawls the raw HTML content from the saved file and verifies consistency.
+
+```python
+import os
+import sys
+import asyncio
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def main():
+    wikipedia_url = "https://en.wikipedia.org/wiki/apple"
+    script_dir = Path(__file__).parent
+    html_file_path = script_dir / "apple.html"
+
+    async with AsyncWebCrawler() as crawler:
+        # Step 1: Crawl the Web URL
+        print("\n=== Step 1: Crawling the Wikipedia URL ===")
+        web_config = CrawlerRunConfig(bypass_cache=True)
+        result = await crawler.arun(url=wikipedia_url, config=web_config)
+
+        if not result.success:
+            print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
+            return
+
+        with open(html_file_path, 'w', encoding='utf-8') as f:
+            f.write(result.html)
+        web_crawl_length = len(result.markdown)
+        print(f"Length of markdown from web crawl: {web_crawl_length}\n")
+
+        # Step 2: Crawl from the Local HTML File
+        print("=== Step 2: Crawling from the Local HTML File ===")
+        file_url = f"file://{html_file_path.resolve()}"
+        file_config = CrawlerRunConfig(bypass_cache=True)
+        local_result = await crawler.arun(url=file_url, config=file_config)
+
+        if not local_result.success:
+            print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
+            return
+
+        local_crawl_length = len(local_result.markdown)
+        assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
+        print("✅ Markdown length matches between web and local file crawl.\n")
+
+        # Step 3: Crawl Using Raw HTML Content
+        print("=== Step 3: Crawling Using Raw HTML Content ===")
+        with open(html_file_path, 'r', encoding='utf-8') as f:
+            raw_html_content = f.read()
+        raw_html_url = f"raw:{raw_html_content}"
+        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
+
+        if not raw_result.success:
+            print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
+            return
+
+        raw_crawl_length = len(raw_result.markdown)
+        assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
+        print("✅ Markdown length matches between web and raw HTML crawl.\n")
+
+        print("All tests passed successfully!")
+    if html_file_path.exists():
+        os.remove(html_file_path)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+# Conclusion
+
+With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios.
+```
+
+
+## File: docs/md_v2/core/markdown-generation.md
+
+```md
+# Markdown Generation Basics
+
+One of Crawl4AI’s core features is generating **clean, structured markdown** from web pages. Originally built to solve the problem of extracting only the “actual” content and discarding boilerplate or noise, Crawl4AI’s markdown system remains one of its biggest draws for AI workflows.
+
+In this tutorial, you’ll learn:
+
+1. How to configure the **Default Markdown Generator**  
+2. How **content filters** (BM25 or Pruning) help you refine markdown and discard junk  
+3. The difference between raw markdown (`result.markdown`) and filtered markdown (`fit_markdown`)  
+
+> **Prerequisites**  
+> - You’ve completed or read [AsyncWebCrawler Basics](../core/simple-crawling.md) to understand how to run a simple crawl.  
+> - You know how to configure `CrawlerRunConfig`.
+
+---
+
+## 1. Quick Example
+
+Here’s a minimal code snippet that uses the **DefaultMarkdownGenerator** with no additional filtering:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+    config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator()
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        
+        if result.success:
+            print("Raw Markdown Output:\n")
+            print(result.markdown)  # The unfiltered markdown from the page
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**What’s happening?**  
+- `CrawlerRunConfig( markdown_generator = DefaultMarkdownGenerator() )` instructs Crawl4AI to convert the final HTML into markdown at the end of each crawl.  
+- The resulting markdown is accessible via `result.markdown`.
+
+---
+
+## 2. How Markdown Generation Works
+
+### 2.1 HTML-to-Text Conversion (Forked & Modified)
+
+Under the hood, **DefaultMarkdownGenerator** uses a specialized HTML-to-text approach that:
+
+- Preserves headings, code blocks, bullet points, etc.  
+- Removes extraneous tags (scripts, styles) that don’t add meaningful content.  
+- Can optionally generate references for links or skip them altogether.
+
+A set of **options** (passed as a dict) allows you to customize precisely how HTML converts to markdown. These map to standard html2text-like configuration plus your own enhancements (e.g., ignoring internal links, preserving certain tags verbatim, or adjusting line widths).
+
+### 2.2 Link Citations & References
+
+By default, the generator can convert `<a href="...">` elements into `[text][1]` citations, then place the actual links at the bottom of the document. This is handy for research workflows that demand references in a structured manner.
+
+### 2.3 Optional Content Filters
+
+Before or after the HTML-to-Markdown step, you can apply a **content filter** (like BM25 or Pruning) to reduce noise and produce a “fit_markdown”—a heavily pruned version focusing on the page’s main text. We’ll cover these filters shortly.
+
+---
+
+## 3. Configuring the Default Markdown Generator
+
+You can tweak the output by passing an `options` dict to `DefaultMarkdownGenerator`. For example:
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Example: ignore all links, don't escape HTML, and wrap text at 80 characters
+    md_generator = DefaultMarkdownGenerator(
+        options={
+            "ignore_links": True,
+            "escape_html": False,
+            "body_width": 80
+        }
+    )
+
+    config = CrawlerRunConfig(
+        markdown_generator=md_generator
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com/docs", config=config)
+        if result.success:
+            print("Markdown:\n", result.markdown[:500])  # Just a snippet
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+
+Some commonly used `options`:
+
+- **`ignore_links`** (bool): Whether to remove all hyperlinks in the final markdown.  
+- **`ignore_images`** (bool): Remove all `![image]()` references.  
+- **`escape_html`** (bool): Turn HTML entities into text (default is often `True`).  
+- **`body_width`** (int): Wrap text at N characters. `0` or `None` means no wrapping.  
+- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page.  
+- **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way.
+
+## 4. Selecting the HTML Source for Markdown Generation
+
+The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown.
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Option 1: Use the raw HTML directly from the webpage (before any processing)
+    raw_md_generator = DefaultMarkdownGenerator(
+        content_source="raw_html",
+        options={"ignore_links": True}
+    )
+    
+    # Option 2: Use the cleaned HTML (after scraping strategy processing - default)
+    cleaned_md_generator = DefaultMarkdownGenerator(
+        content_source="cleaned_html",  # This is the default
+        options={"ignore_links": True}
+    )
+    
+    # Option 3: Use preprocessed HTML optimized for schema extraction
+    fit_md_generator = DefaultMarkdownGenerator(
+        content_source="fit_html",
+        options={"ignore_links": True}
+    )
+    
+    # Use one of the generators in your crawler config
+    config = CrawlerRunConfig(
+        markdown_generator=raw_md_generator  # Try each of the generators
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        if result.success:
+            print("Markdown:\n", result.markdown.raw_markdown[:500])
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+
+### HTML Source Options
+
+- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed.
+
+- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content.
+
+- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed.
+
+### When to Use Each Option
+
+- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal.
+- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep.
+- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction.
+
+---
+
+## 5. Content Filters
+
+**Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want.
+
+### 5.1 BM25ContentFilter
+
+If you have a **search query**, BM25 is a good choice:
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter
+from crawl4ai import CrawlerRunConfig
+
+bm25_filter = BM25ContentFilter(
+    user_query="machine learning",
+    bm25_threshold=1.2,
+    use_stemming=True
+)
+
+md_generator = DefaultMarkdownGenerator(
+    content_filter=bm25_filter,
+    options={"ignore_links": True}
+)
+
+config = CrawlerRunConfig(markdown_generator=md_generator)
+```
+
+- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.  
+- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.  
+- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”).
+
+**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
+
+### 5.2 PruningContentFilter
+
+If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections.
+
+```python
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+prune_filter = PruningContentFilter(
+    threshold=0.5,
+    threshold_type="fixed",  # or "dynamic"
+    min_word_threshold=50
+)
+```
+
+- **`threshold`**: Score boundary. Blocks below this score get removed.  
+- **`threshold_type`**:  
+    - `"fixed"`: Straight comparison (`score >= threshold` keeps the block).  
+    - `"dynamic"`: The filter adjusts threshold in a data-driven manner.  
+- **`min_word_threshold`**: Discard blocks under N words as likely too short or unhelpful.
+
+**When to Use PruningContentFilter**  
+- You want a broad cleanup without a user query.  
+- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
+
+### 5.3 LLMContentFilter
+
+For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai.content_filter_strategy import LLMContentFilter
+
+async def main():
+    # Initialize LLM filter with specific instruction
+    filter = LLMContentFilter(
+        llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
+        instruction="""
+        Focus on extracting the core educational content.
+        Include:
+        - Key concepts and explanations
+        - Important code examples
+        - Essential technical details
+        Exclude:
+        - Navigation elements
+        - Sidebars
+        - Footer content
+        Format the output as clean markdown with proper code blocks and headers.
+        """,
+        chunk_token_threshold=4096,  # Adjust based on your needs
+        verbose=True
+    )
+
+    config = CrawlerRunConfig(
+        content_filter=filter
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        print(result.markdown.fit_markdown)  # Filtered markdown content
+```
+
+**Key Features:**
+- **Intelligent Filtering**: Uses LLMs to understand and extract relevant content while maintaining context
+- **Customizable Instructions**: Tailor the filtering process with specific instructions
+- **Chunk Processing**: Handles large documents by processing them in chunks (controlled by `chunk_token_threshold`)
+- **Parallel Processing**: For better performance, use a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks
+
+**Two Common Use Cases:**
+
+1. **Exact Content Preservation**:
+```python
+filter = LLMContentFilter(
+    instruction="""
+    Extract the main educational content while preserving its original wording and substance completely.
+    1. Maintain the exact language and terminology
+    2. Keep all technical explanations and examples intact
+    3. Preserve the original flow and structure
+    4. Remove only clearly irrelevant elements like navigation menus and ads
+    """,
+    chunk_token_threshold=4096
+)
+```
+
+2. **Focused Content Extraction**:
+```python
+filter = LLMContentFilter(
+    instruction="""
+    Focus on extracting specific types of content:
+    - Technical documentation
+    - Code examples
+    - API references
+    Reformat the content into clear, well-structured markdown
+    """,
+    chunk_token_threshold=4096
+)
+```
+
+> **Performance Tip**: Set a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks. The default value is infinity, which processes the entire content as a single chunk.
+
+---
+
+## 6. Using Fit Markdown
+
+When a content filter is active, the library produces two forms of markdown inside `result.markdown`:
+
+1. **`raw_markdown`**: The full unfiltered markdown.  
+2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+async def main():
+    config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(threshold=0.6),
+            options={"ignore_links": True}
+        )
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://news.example.com/tech", config=config)
+        if result.success:
+            print("Raw markdown:\n", result.markdown)
+            
+            # If a filter is used, we also have .fit_markdown:
+            md_object = result.markdown  # or your equivalent
+            print("Filtered markdown:\n", md_object.fit_markdown)
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 7. The `MarkdownGenerationResult` Object
+
+Crawl4AI stores detailed markdown output in a `MarkdownGenerationResult` object (available as `result.markdown`), which exposes fields such as:
+
+- **`raw_markdown`**: The direct HTML-to-markdown transformation (no filtering).  
+- **`markdown_with_citations`**: A version that moves links to reference-style footnotes.  
+- **`references_markdown`**: A separate string or section containing the gathered references.  
+- **`fit_markdown`**: The filtered markdown if you used a content filter.  
+- **`fit_html`**: The corresponding HTML snippet used to generate `fit_markdown` (helpful for debugging or advanced usage).
+
+**Example**:
+
+```python
+md_obj = result.markdown  # your library’s naming may vary
+print("RAW:\n", md_obj.raw_markdown)
+print("CITED:\n", md_obj.markdown_with_citations)
+print("REFERENCES:\n", md_obj.references_markdown)
+print("FIT:\n", md_obj.fit_markdown)
+```
+
+**Why Does This Matter?**  
+- You can supply `raw_markdown` to an LLM if you want the entire text.  
+- Or feed `fit_markdown` into a vector database to reduce token usage.  
+- `references_markdown` can help you keep track of link provenance.
+
+---
+
+The following section, “Combining Filters (BM25 + Pruning) in Two Passes,” demonstrates how you can run **two** passes of content filtering without re-crawling, by taking the HTML (or text) from a first pass and feeding it into the second filter. It relies on the fact that **BM25ContentFilter** directly accepts **HTML** strings (and can also handle plain text with minimal adaptation).
+
+---
+
+## 8. Combining Filters (BM25 + Pruning) in Two Passes
+
+You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead:
+
+1. **First pass**: Apply `PruningContentFilter` directly to the raw HTML from `result.html` (the crawler’s downloaded HTML).  
+2. **Second pass**: Take the pruned HTML (or text) from step 1, and feed it into `BM25ContentFilter`, focusing on a user query.
+
+### Two-Pass Example
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
+from bs4 import BeautifulSoup
+
+async def main():
+    # 1. Crawl with minimal or no markdown generator, just get raw HTML
+    config = CrawlerRunConfig(
+        # If you only want raw HTML, you can skip passing a markdown_generator
+        # or provide one but focus on .html in this example
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com/tech-article", config=config)
+
+        if not result.success or not result.html:
+            print("Crawl failed or no HTML content.")
+            return
+        
+        raw_html = result.html
+        
+        # 2. First pass: PruningContentFilter on raw HTML
+        pruning_filter = PruningContentFilter(threshold=0.5, min_word_threshold=50)
+        
+        # filter_content returns a list of "text chunks" or cleaned HTML sections
+        pruned_chunks = pruning_filter.filter_content(raw_html)
+        # This list is basically pruned content blocks, presumably in HTML or text form
+        
+        # For demonstration, let's combine these chunks back into a single HTML-like string
+        # or you could do further processing. It's up to your pipeline design.
+        pruned_html = "\n".join(pruned_chunks)
+        
+        # 3. Second pass: BM25ContentFilter with a user query
+        bm25_filter = BM25ContentFilter(
+            user_query="machine learning",
+            bm25_threshold=1.2,
+            language="english"
+        )
+        
+        # returns a list of text chunks
+        bm25_chunks = bm25_filter.filter_content(pruned_html)  
+        
+        if not bm25_chunks:
+            print("Nothing matched the BM25 query after pruning.")
+            return
+        
+        # 4. Combine or display final results
+        final_text = "\n---\n".join(bm25_chunks)
+        
+        print("==== PRUNED OUTPUT (first pass) ====")
+        print(pruned_html[:500], "... (truncated)")  # preview
+
+        print("\n==== BM25 OUTPUT (second pass) ====")
+        print(final_text[:500], "... (truncated)")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### What’s Happening?
+
+1. **Raw HTML**: We crawl once and store the raw HTML in `result.html`.  
+2. **PruningContentFilter**: Takes HTML + optional parameters. It extracts blocks of text or partial HTML, removing headings/sections deemed “noise.” It returns a **list of text chunks**.  
+3. **Combine or Transform**: We join these pruned chunks back into a single HTML-like string. (Alternatively, you could store them in a list for further logic—whatever suits your pipeline.)  
+4. **BM25ContentFilter**: We feed the pruned string into `BM25ContentFilter` with a user query. This second pass further narrows the content to chunks relevant to “machine learning.”
+
+**No Re-Crawling**: We used `raw_html` from the first pass, so there’s no need to run `arun()` again—**no second network request**.
+
+### Tips & Variations
+
+- **Plain Text vs. HTML**: If your pruned output is mostly text, BM25 can still handle it; just keep in mind it expects a valid string input. If you supply partial HTML (like `"<p>some text</p>"`), it will parse it as HTML.  
+- **Chaining in a Single Pipeline**: If your code supports it, you can chain multiple filters automatically. Otherwise, manual two-pass filtering (as shown) is straightforward.  
+- **Adjust Thresholds**: If you see too much or too little text in step one, tweak `threshold=0.5` or `min_word_threshold=50`. Similarly, `bm25_threshold=1.2` can be raised/lowered for more or fewer chunks in step two.
+
+### One-Pass Combination?
+
+If your codebase or pipeline design allows applying multiple filters in one pass, you could do so. But often it’s simpler—and more transparent—to run them sequentially, analyzing each step’s result.
+
+**Bottom Line**: By **manually chaining** your filtering logic in two passes, you get powerful incremental control over the final content. First, remove “global” clutter with Pruning, then refine further with BM25-based query relevance—without incurring a second network crawl.
+
+---
+
+## 9. Common Pitfalls & Tips
+
+1. **No Markdown Output?**  
+   - Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements.  
+   - Check if your content filter is too aggressive. Lower thresholds or disable the filter to see if content reappears.
+
+2. **Performance Considerations**  
+   - Very large pages with multiple filters can be slower. Consider `cache_mode` to avoid re-downloading.  
+   - If your final use case is LLM ingestion, consider summarizing further or chunking big texts.
+
+3. **Take Advantage of `fit_markdown`**  
+   - Great for RAG pipelines, semantic search, or any scenario where extraneous boilerplate is unwanted.  
+   - Still verify the textual quality—some sites have crucial data in footers or sidebars.
+
+4. **Adjusting `html2text` Options**  
+   - If you see lots of raw HTML slipping into the text, turn on `escape_html`.  
+   - If code blocks look messy, experiment with `mark_code` or `handle_code_in_pre`.
+
+---
+
+## 10. Summary & Next Steps
+
+In this **Markdown Generation Basics** tutorial, you learned to:
+
+- Configure the **DefaultMarkdownGenerator** with HTML-to-text options.  
+- Select different HTML sources using the `content_source` parameter.  
+- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal.  
+- Distinguish between raw and filtered markdown (`fit_markdown`).  
+- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.).
+
+Now you can produce high-quality Markdown from any website, focusing on exactly the content you need—an essential step for powering AI models, summarization pipelines, or knowledge-base queries.
+
+**Last Updated**: 2025-01-01
+
+```
+
+
+## File: docs/md_v2/core/page-interaction.md
+
+```md
+# Page Interaction
+
+Crawl4AI provides powerful features for interacting with **dynamic** webpages, handling JavaScript execution, waiting for conditions, and managing multi-step flows. By combining **js_code**, **wait_for**, and certain **CrawlerRunConfig** parameters, you can:
+
+1. Click “Load More” buttons  
+2. Fill forms and submit them  
+3. Wait for elements or data to appear  
+4. Reuse sessions across multiple steps  
+
+Below is a quick overview of how to do it.
+
+---
+
+## 1. JavaScript Execution
+
+### Basic Execution
+
+**`js_code`** in **`CrawlerRunConfig`** accepts either a single JS string or a list of JS snippets.  
+**Example**: We’ll scroll to the bottom of the page, then optionally click a “Load More” button.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Single JS command
+    config = CrawlerRunConfig(
+        js_code="window.scrollTo(0, document.body.scrollHeight);"
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com",  # Example site
+            config=config
+        )
+        print("Crawled length:", len(result.cleaned_html))
+
+    # Multiple commands
+    js_commands = [
+        "window.scrollTo(0, document.body.scrollHeight);",
+        # 'More' link on Hacker News
+        "document.querySelector('a.morelink')?.click();",  
+    ]
+    config = CrawlerRunConfig(js_code=js_commands)
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com",  # Another pass
+            config=config
+        )
+        print("After scroll+click, length:", len(result.cleaned_html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Relevant `CrawlerRunConfig` params**:
+- **`js_code`**: A string or list of strings with JavaScript to run after the page loads.
+- **`js_only`**: If set to `True` on subsequent calls, indicates we’re continuing an existing session without a new full navigation.  
+- **`session_id`**: If you want to keep the same page across multiple calls, specify an ID.
+
+---
+
+## 2. Wait Conditions
+
+### 2.1 CSS-Based Waiting
+
+Sometimes, you just want to wait for a specific element to appear. For example:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        # Wait for at least 30 items on Hacker News
+        wait_for="css:.athing:nth-child(30)"  
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com",
+            config=config
+        )
+        print("We have at least 30 items loaded!")
+        # Rough check
+        print("Total items in HTML:", result.cleaned_html.count("athing"))  
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key param**:
+- **`wait_for="css:..."`**: Tells the crawler to wait until that CSS selector is present.
+
+### 2.2 JavaScript-Based Waiting
+
+For more complex conditions (e.g., waiting for content length to exceed a threshold), prefix `js:`:
+
+```python
+wait_condition = """() => {
+    const items = document.querySelectorAll('.athing');
+    return items.length > 50;  // Wait for at least 51 items
+}"""
+
+config = CrawlerRunConfig(wait_for=f"js:{wait_condition}")
+```
+
+**Behind the Scenes**: Crawl4AI keeps polling the JS function until it returns `true` or a timeout occurs.
+
+---
+
+## 3. Handling Dynamic Content
+
+Many modern sites require **multiple steps**: scrolling, clicking “Load More,” or updating via JavaScript. Below are typical patterns.
+
+### 3.1 Load More Example (Hacker News “More” Link)
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Step 1: Load initial Hacker News page
+    config = CrawlerRunConfig(
+        wait_for="css:.athing:nth-child(30)"  # Wait for 30 items
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com",
+            config=config
+        )
+        print("Initial items loaded.")
+
+        # Step 2: Let's scroll and click the "More" link
+        load_more_js = [
+            "window.scrollTo(0, document.body.scrollHeight);",
+            # The "More" link at page bottom
+            "document.querySelector('a.morelink')?.click();"  
+        ]
+        
+        next_page_conf = CrawlerRunConfig(
+            js_code=load_more_js,
+            wait_for="""js:() => {
+                return document.querySelectorAll('.athing').length > 30;
+            }""",
+            # Mark that we do not re-navigate, but run JS in the same session:
+            js_only=True,
+            session_id="hn_session"
+        )
+
+        # Re-use the same crawler session
+        result2 = await crawler.arun(
+            url="https://news.ycombinator.com",  # same URL but continuing session
+            config=next_page_conf
+        )
+        total_items = result2.cleaned_html.count("athing")
+        print("Items after load-more:", total_items)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key params**:
+- **`session_id="hn_session"`**: Keep the same page across multiple calls to `arun()`.
+- **`js_only=True`**: We’re not performing a full reload, just applying JS in the existing page.
+- **`wait_for`** with `js:`: Wait for item count to grow beyond 30.
+
+---
+
+### 3.2 Form Interaction
+
+If the site has a search or login form, you can fill fields and submit them with **`js_code`**. For instance, if GitHub had a local search form:
+
+```python
+js_form_interaction = """
+document.querySelector('#your-search').value = 'TypeScript commits';
+document.querySelector('form').submit();
+"""
+
+config = CrawlerRunConfig(
+    js_code=js_form_interaction,
+    wait_for="css:.commit"
+)
+result = await crawler.arun(url="https://github.com/search", config=config)
+```
+
+**In reality**: Replace IDs or classes with the real site’s form selectors.
+
+---
+
+## 4. Timing Control
+
+1. **`page_timeout`** (ms): Overall page load or script execution time limit.  
+2. **`delay_before_return_html`** (seconds): Wait an extra moment before capturing the final HTML.  
+3. **`mean_delay`** & **`max_range`**: If you call `arun_many()` with multiple URLs, these add a random pause between each request.
+
+**Example**:
+
+```python
+config = CrawlerRunConfig(
+    page_timeout=60000,  # 60s limit
+    delay_before_return_html=2.5
+)
+```
+
+---
+
+## 5. Multi-Step Interaction Example
+
+Below is a simplified script that clicks the “Next Page” button several times on GitHub’s TypeScript commits page. It **re-uses** the same session so each click loads the next batch of commits in the already-open page. The code includes the relevant **`CrawlerRunConfig`** parameters you’d rely on.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def multi_page_commits():
+    browser_cfg = BrowserConfig(
+        headless=False,  # Visible for demonstration
+        verbose=True
+    )
+    session_id = "github_ts_commits"
+    
+    base_wait = """js:() => {
+        const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+        return commits.length > 0;
+    }"""
+
+    # Step 1: Load initial commits
+    config1 = CrawlerRunConfig(
+        wait_for=base_wait,
+        session_id=session_id,
+        cache_mode=CacheMode.BYPASS,
+        # Not using js_only yet since it's our first load
+    )
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun(
+            url="https://github.com/microsoft/TypeScript/commits/main",
+            config=config1
+        )
+        print("Initial commits loaded. Count:", result.cleaned_html.count("commit"))
+
+        # Step 2: For subsequent pages, we run JS to click 'Next Page' if it exists
+        js_next_page = """
+        const selector = 'a[data-testid="pagination-next-button"]';
+        const button = document.querySelector(selector);
+        if (button) button.click();
+        """
+        
+        # Wait until new commits appear
+        wait_for_more = """js:() => {
+            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+            if (!window.firstCommit && commits.length>0) {
+                window.firstCommit = commits[0].textContent;
+                return false;
+            }
+            // If top commit changes, we have new commits
+            const topNow = commits[0]?.textContent.trim();
+            return topNow && topNow !== window.firstCommit;
+        }"""
+
+        for page in range(2):  # let's do 2 more "Next" pages
+            config_next = CrawlerRunConfig(
+                session_id=session_id,
+                js_code=js_next_page,
+                wait_for=wait_for_more,
+                js_only=True,       # We're continuing from the open tab
+                cache_mode=CacheMode.BYPASS
+            )
+            result2 = await crawler.arun(
+                url="https://github.com/microsoft/TypeScript/commits/main",
+                config=config_next
+            )
+            print(f"Page {page+2} commits count:", result2.cleaned_html.count("commit"))
+
+        # Optionally kill session
+        await crawler.crawler_strategy.kill_session(session_id)
+
+async def main():
+    await multi_page_commits()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**:
+
+- **`session_id`**: Keep the same page open.  
+- **`js_code`** + **`wait_for`** + **`js_only=True`**: We do partial refreshes, waiting for new commits to appear.  
+- **`cache_mode=CacheMode.BYPASS`** ensures we always see fresh data each step.
+
+---
+
+## 6. Combine Interaction with Extraction
+
+Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+schema = {
+    "name": "Commits",
+    "baseSelector": "li.Box-sc-g0xbh4-0",
+    "fields": [
+        {"name": "title", "selector": "h4.markdown-title", "type": "text"}
+    ]
+}
+config = CrawlerRunConfig(
+    session_id="ts_commits_session",
+    js_code=js_next_page,
+    wait_for=wait_for_more,
+    extraction_strategy=JsonCssExtractionStrategy(schema)
+)
+```
+
+When done, check `result.extracted_content` for the JSON.
+
+---
+
+## 7. Relevant `CrawlerRunConfig` Parameters
+
+Below are the key interaction-related parameters in `CrawlerRunConfig`. For a full list, see [Configuration Parameters](../api/parameters.md).
+
+- **`js_code`**: JavaScript to run after initial load.  
+- **`js_only`**: If `True`, no new page navigation—only JS in the existing session.  
+- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.  
+- **`session_id`**: Reuse the same page across calls.  
+- **`cache_mode`**: Whether to read/write from the cache or bypass.  
+- **`remove_overlay_elements`**: Remove certain popups automatically.  
+- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or “human-like” interactions.
+
+---
+
+## 8. Conclusion
+
+Crawl4AI’s **page interaction** features let you:
+
+1. **Execute JavaScript** for scrolling, clicks, or form filling.  
+2. **Wait** for CSS or custom JS conditions before capturing data.  
+3. **Handle** multi-step flows (like “Load More”) with partial reloads or persistent sessions.  
+4. Combine with **structured extraction** for dynamic sites.
+
+With these tools, you can scrape modern, interactive webpages confidently. For advanced hooking, user simulation, or in-depth config, check the [API reference](../api/parameters.md) or related advanced docs. Happy scripting!
+```
+
+
+## File: docs/md_v2/core/quickstart.md
+
+```md
+# Getting Started with Crawl4AI
+
+Welcome to **Crawl4AI**, an open-source LLM-friendly Web Crawler & Scraper. In this tutorial, you’ll:
+
+1. Run your **first crawl** using minimal configuration.  
+2. Generate **Markdown** output (and learn how it’s influenced by content filters).  
+3. Experiment with a simple **CSS-based extraction** strategy.  
+4. See a glimpse of **LLM-based extraction** (including open-source and closed-source model options).  
+5. Crawl a **dynamic** page that loads content via JavaScript.
+
+---
+
+## 1. Introduction
+
+Crawl4AI provides:
+
+- An asynchronous crawler, **`AsyncWebCrawler`**.  
+- Configurable browser and run settings via **`BrowserConfig`** and **`CrawlerRunConfig`**.  
+- Automatic HTML-to-Markdown conversion via **`DefaultMarkdownGenerator`** (supports optional filters).  
+- Multiple extraction strategies (LLM-based or “traditional” CSS/XPath-based).
+
+By the end of this guide, you’ll have performed a basic crawl, generated Markdown, tried out two extraction strategies, and crawled a dynamic page that uses “Load More” buttons or JavaScript updates.
+
+---
+
+## 2. Your First Crawl
+
+Here’s a minimal Python script that creates an **`AsyncWebCrawler`**, fetches a webpage, and prints the first 300 characters of its Markdown output:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com")
+        print(result.markdown[:300])  # Print first 300 chars
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**What’s happening?**
+- **`AsyncWebCrawler`** launches a headless browser (Chromium by default).
+- It fetches `https://example.com`.
+- Crawl4AI automatically converts the HTML into Markdown.
+
+You now have a simple, working crawl!
+
+---
+
+## 3. Basic Configuration (Light Introduction)
+
+Crawl4AI’s crawler can be heavily customized using two main classes:
+
+1. **`BrowserConfig`**: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.).  
+2. **`CrawlerRunConfig`**: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.).
+
+Below is an example with minimal usage:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    browser_conf = BrowserConfig(headless=True)  # or False to see the browser
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_conf
+        )
+        print(result.markdown)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+> IMPORTANT: By default, the cache mode is set to `CacheMode.ENABLED`, so to get fresh content you need to set it to `CacheMode.BYPASS`.
+
+We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
+
+---
+
+## 4. Generating Markdown Output
+
+By default, Crawl4AI automatically generates Markdown from each crawled page. However, the exact output depends on whether you specify a **markdown generator** or **content filter**.
+
+- **`result.markdown`**:  
+  The direct HTML-to-Markdown conversion.  
+- **`result.markdown.fit_markdown`**:  
+  The same content after applying any configured **content filter** (e.g., `PruningContentFilter`).
+
+### Example: Using a Filter with `DefaultMarkdownGenerator`
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+md_generator = DefaultMarkdownGenerator(
+    content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
+)
+
+config = CrawlerRunConfig(
+    cache_mode=CacheMode.BYPASS,
+    markdown_generator=md_generator
+)
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun("https://news.ycombinator.com", config=config)
+    print("Raw Markdown length:", len(result.markdown.raw_markdown))
+    print("Fit Markdown length:", len(result.markdown.fit_markdown))
+```
+
+**Note**: If you do **not** specify a content filter or markdown generator, you’ll typically see only the raw Markdown. `PruningContentFilter` may add around `50ms` of processing time. We’ll dive deeper into these strategies in a dedicated **Markdown Generation** tutorial.
+
+---
+
+## 5. Simple Data Extraction (CSS-based)
+
+Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. Below is a minimal CSS-based example:
+
+> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import LLMConfig
+
+# Generate a schema (one-time cost)
+html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
+
+# Using OpenAI (requires API token)
+schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")  # Required for OpenAI
+)
+
+# Or using Ollama (open source, no token needed)
+schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
+)
+
+# Use the schema for fast, repeated extractions
+strategy = JsonCssExtractionStrategy(schema)
+```
+
+For a complete guide on schema generation and advanced usage, see [No-LLM Extraction Strategies](../extraction/no-llm-strategies.md).
+
+Here's a basic extraction example:
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def main():
+    schema = {
+        "name": "Example Items",
+        "baseSelector": "div.item",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+
+    raw_html = "<div class='item'><h2>Item 1</h2><a href='https://example.com/item1'>Link 1</a></div>"
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="raw://" + raw_html,
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                extraction_strategy=JsonCssExtractionStrategy(schema)
+            )
+        )
+        # The JSON output is stored in 'extracted_content'
+        data = json.loads(result.extracted_content)
+        print(data)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Why is this helpful?**
+- Great for repetitive page structures (e.g., item listings, articles).
+- No AI usage or costs.
+- The crawler returns a JSON string you can parse or store.
+
+> Tip: You can pass raw HTML to the crawler instead of a URL. To do so, prefix the HTML with `raw://`.
+
+---
+
+## 6. Simple Data Extraction (LLM-based)
+
+For more complex or irregular pages, a language model can parse text intelligently into a structure you define. Crawl4AI supports **open-source** or **closed-source** providers:
+
+- **Open-Source Models** (e.g., `ollama/llama3.3`, `no_token`)  
+- **OpenAI Models** (e.g., `openai/gpt-4`, requires `api_token`)  
+- Or any provider supported by the underlying library
+
+Below is an example using **open-source** style (no token) and closed-source:
+
+```python
+import os
+import json
+import asyncio
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+
+    if api_token is None and provider != "ollama":
+        print(f"API token is required for {provider}. Skipping this example.")
+        return
+
+    browser_config = BrowserConfig(headless=True)
+
+    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+    if extra_headers:
+        extra_args["extra_headers"] = extra_headers
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=1,
+        page_timeout=80000,
+        extraction_strategy=LLMExtractionStrategy(
+            llm_config = LLMConfig(provider=provider,api_token=api_token),
+            schema=OpenAIModelFee.model_json_schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+            Do not miss any models in the entire content.""",
+            extra_args=extra_args,
+        ),
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://openai.com/api/pricing/", config=crawler_config
+        )
+        print(result.extracted_content)
+
+if __name__ == "__main__":
+
+    asyncio.run(
+        extract_structured_data_using_llm(
+            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+        )
+    )
+```
+
+**What’s happening?**
+- We define a Pydantic schema (`OpenAIModelFee`) describing the fields we want.
+- The LLM extraction strategy uses that schema and your instructions to transform raw text into structured JSON.
+- Depending on the **provider** and **api_token**, you can use local models or a remote API.
+
+---
+
+## 7. Multi-URL Concurrency (Preview)
+
+If you need to crawl multiple URLs in **parallel**, you can use `arun_many()`. By default, Crawl4AI employs a **MemoryAdaptiveDispatcher**, automatically adjusting concurrency based on system resources. Here’s a quick glimpse:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def quick_parallel_example():
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3"
+    ]
+    
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=True  # Enable streaming mode
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # Stream results as they complete
+        async for result in await crawler.arun_many(urls, config=run_conf):
+            if result.success:
+                print(f"[OK] {result.url}, length: {len(result.markdown.raw_markdown)}")
+            else:
+                print(f"[ERROR] {result.url} => {result.error_message}")
+
+        # Or get all results at once (default behavior)
+        run_conf = run_conf.clone(stream=False)
+        results = await crawler.arun_many(urls, config=run_conf)
+        for res in results:
+            if res.success:
+                print(f"[OK] {res.url}, length: {len(res.markdown.raw_markdown)}")
+            else:
+                print(f"[ERROR] {res.url} => {res.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(quick_parallel_example())
+```
+
+The example above shows two ways to handle multiple URLs:
+1. **Streaming mode** (`stream=True`): Process results as they become available using `async for`
+2. **Batch mode** (`stream=False`): Wait for all results to complete
+
+For more advanced concurrency (e.g., a **semaphore-based** approach, **adaptive memory usage throttling**, or customized rate limiting), see [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md).
+
+---
+
+## 8. Dynamic Content Example
+
+Some sites require multiple “page clicks” or dynamic JavaScript updates. Below is an example showing how to **click** through a set of tabs with injected JavaScript so that each tab’s content gets rendered, and then extract structured data from the page, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def extract_structured_data_using_css_extractor():
+    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+    schema = {
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src",
+            },
+        ],
+    }
+
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+    js_click_tabs = """
+    (async () => {
+        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+        for(let tab of tabs) {
+            tab.scrollIntoView();
+            tab.click();
+            await new Promise(r => setTimeout(r, 500));
+        }
+    })();
+    """
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        js_code=[js_click_tabs],
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.kidocode.com/degrees/technology", config=crawler_config
+        )
+
+        companies = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(companies)} companies")
+        print(json.dumps(companies[0], indent=2))
+
+async def main():
+    await extract_structured_data_using_css_extractor()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**:
+
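+- **`BrowserConfig(headless=True, java_script_enabled=True)`**: The browser runs without a visible window and with JavaScript enabled so the injected script can execute. Set `headless=False` if you want to watch the tabs being clicked.  
+- **`CrawlerRunConfig(...)`**: We specify the extraction strategy, bypass the cache, and pass the tab-clicking script via `js_code`.  
+- **`js_code`**: The script scrolls each tab into view, clicks it, and pauses 500 ms before moving on to the next tab.  
+- For multi-step flows that continue in the same page (e.g., clicking a “Next” button and waiting for new commits to load), combine **`session_id`**, **`wait_for`**, and **`js_only=True`**, which indicates we’re not re-navigating but continuing the existing session.  
+- In such session-based flows, finally call `kill_session()` to clean up the page and browser session. See [Page Interaction](./page-interaction.md) for a full example.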
+
+---
+
+## 9. Next Steps
+
+Congratulations! You have:
+
+1. Performed a basic crawl and printed Markdown.  
+2. Used **content filters** with a markdown generator.  
+3. Extracted JSON via **CSS** or **LLM** strategies.  
+4. Handled **dynamic** pages with JavaScript triggers.
+
+If you’re ready for more, check out:
+
+- **Installation**: A deeper dive into advanced installs, Docker usage (experimental), or optional dependencies.  
+- **Hooks & Auth**: Learn how to run custom JavaScript or handle logins with cookies, local storage, etc.  
+- **Deployment**: Explore ephemeral testing in Docker or plan for the upcoming stable Docker release.  
+- **Browser Management**: Delve into user simulation, stealth modes, and concurrency best practices.  
+
+Crawl4AI is a powerful, flexible tool. Enjoy building out your scrapers, data pipelines, or AI-driven extraction flows. Happy crawling!
+```
+
+
+## File: docs/md_v2/core/simple-crawling.md
+
+```md
+# Simple Crawling
+
+This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response.
+
+## Basic Usage
+
+Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+async def main():
+    browser_config = BrowserConfig()  # Default browser configuration
+    run_config = CrawlerRunConfig()   # Default crawl run configuration
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_config
+        )
+        print(result.markdown)  # Print clean markdown content
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Understanding the Response
+
+The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    config=CrawlerRunConfig(fit_markdown=True)
+)
+
+# Different content formats
+print(result.html)         # Raw HTML
+print(result.cleaned_html) # Cleaned HTML
+print(result.markdown.raw_markdown) # Raw markdown from cleaned html
+print(result.markdown.fit_markdown) # Most relevant content in markdown
+
+# Check success status
+print(result.success)      # True if crawl succeeded
+print(result.status_code)  # HTTP status code (e.g., 200, 404)
+
+# Access extracted media and links
+print(result.media)        # Dictionary of found media (images, videos, audio)
+print(result.links)        # Dictionary of internal and external links
+```
+
+## Adding Basic Options
+
+Customize your crawl using `CrawlerRunConfig`:
+
+```python
+run_config = CrawlerRunConfig(
+    word_count_threshold=10,        # Minimum words per content block
+    exclude_external_links=True,    # Remove external links
+    remove_overlay_elements=True,   # Remove popups/modals
+    process_iframes=True           # Process iframe content
+)
+
+result = await crawler.arun(
+    url="https://example.com",
+    config=run_config
+)
+```
+
+## Handling Errors
+
+Always check if the crawl was successful:
+
+```python
+run_config = CrawlerRunConfig()
+result = await crawler.arun(url="https://example.com", config=run_config)
+
+if not result.success:
+    print(f"Crawl failed: {result.error_message}")
+    print(f"Status code: {result.status_code}")
+```
+
+## Logging and Debugging
+
+Enable verbose logging in `BrowserConfig`:
+
+```python
+browser_config = BrowserConfig(verbose=True)
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    run_config = CrawlerRunConfig()
+    result = await crawler.arun(url="https://example.com", config=run_config)
+```
+
+## Complete Example
+
+Here's a more comprehensive example demonstrating common usage patterns:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    browser_config = BrowserConfig(verbose=True)
+    run_config = CrawlerRunConfig(
+        # Content filtering
+        word_count_threshold=10,
+        excluded_tags=['form', 'header'],
+        exclude_external_links=True,
+        
+        # Content processing
+        process_iframes=True,
+        remove_overlay_elements=True,
+        
+        # Cache control
+        cache_mode=CacheMode.ENABLED  # Use cache if available
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_config
+        )
+        
+        if result.success:
+            # Print clean content
+            print("Content:", result.markdown[:500])  # First 500 chars
+            
+            # Process images
+            for image in result.media["images"]:
+                print(f"Found image: {image['src']}")
+            
+            # Process links
+            for link in result.links["internal"]:
+                print(f"Internal link: {link['href']}")
+                
+        else:
+            print(f"Crawl failed: {result.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+```
+
+
+## File: docs/md_v2/advanced/advanced-features.md
+
+```md
+# Overview of Some Important Advanced Features 
+(Proxy, PDF, Screenshot, SSL, Headers, & Storage State)
+
+Crawl4AI offers multiple power-user features that go beyond simple crawling. This tutorial covers:
+
+1. **Proxy Usage**  
+2. **Capturing PDFs & Screenshots**  
+3. **Handling SSL Certificates**  
+4. **Custom Headers**  
+5. **Session Persistence & Local Storage**  
+6. **Robots.txt Compliance**  
+
+> **Prerequisites**  
+> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)  
+> - You know how to run or configure your Python environment with Playwright installed
+
+---
+
+## 1. Proxy Usage
+
+If you need to route your crawl traffic through a proxy—whether for IP rotation, geo-testing, or privacy—Crawl4AI supports it via `BrowserConfig.proxy_config`.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    browser_cfg = BrowserConfig(
+        proxy_config={
+            "server": "http://proxy.example.com:8080",
+            "username": "myuser",
+            "password": "mypass",
+        },
+        headless=True
+    )
+    crawler_cfg = CrawlerRunConfig(
+        verbose=True
+    )
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun(
+            url="https://www.whatismyip.com/",
+            config=crawler_cfg
+        )
+        if result.success:
+            print("[OK] Page fetched via proxy.")
+            print("Page HTML snippet:", result.html[:200])
+        else:
+            print("[ERROR]", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**  
+- **`proxy_config`** expects a dict with `server` and optional auth credentials.  
+- Many commercial proxies provide an HTTP/HTTPS “gateway” server that you specify in `server`.  
+- If your proxy doesn’t need auth, omit `username`/`password`.
+
+---
+
+## 2. Capturing PDFs & Screenshots
+
+Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI can do both in one pass:
+
+```python
+import os, asyncio
+from base64 import b64decode
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
+            cache_mode=CacheMode.BYPASS,
+            pdf=True,
+            screenshot=True
+        )
+        
+        if result.success:
+            # Save screenshot
+            if result.screenshot:
+                with open("wikipedia_screenshot.png", "wb") as f:
+                    f.write(b64decode(result.screenshot))
+            
+            # Save PDF
+            if result.pdf:
+                with open("wikipedia_page.pdf", "wb") as f:
+                    f.write(result.pdf)
+            
+            print("[OK] PDF & screenshot captured.")
+        else:
+            print("[ERROR]", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Why PDF + Screenshot?**  
+- Large or complex pages can be slow or error-prone with “traditional” full-page screenshots.  
+- Exporting a PDF is more reliable for very long pages. Crawl4AI automatically converts the first PDF page into an image if you request both.  
+
+**Relevant Parameters**  
+- **`pdf=True`**: Exports the current page as a PDF (raw bytes in `result.pdf`, written to disk as-is in the example above).  
+- **`screenshot=True`**: Creates a screenshot (base64-encoded in `result.screenshot`).  
+- **`scan_full_page`** or advanced hooking can further refine how the crawler captures content.
+
+---
+
+## 3. Handling SSL Certificates
+
+If you need to verify or export a site’s SSL certificate—for compliance, debugging, or data analysis—Crawl4AI can fetch it during the crawl:
+
+```python
+import asyncio, os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    tmp_dir = os.path.join(os.getcwd(), "tmp")
+    os.makedirs(tmp_dir, exist_ok=True)
+    
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+            print("\nCertificate Information:")
+            print(f"Issuer (CN): {cert.issuer.get('CN', '')}")
+            print(f"Valid until: {cert.valid_until}")
+            print(f"Fingerprint: {cert.fingerprint}")
+
+            # Export in multiple formats:
+            cert.to_json(os.path.join(tmp_dir, "certificate.json"))
+            cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))
+            cert.to_der(os.path.join(tmp_dir, "certificate.der"))
+            
+            print("\nCertificate exported to JSON/PEM/DER in 'tmp' folder.")
+        else:
+            print("[ERROR] No certificate or crawl failed.")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**  
+- **`fetch_ssl_certificate=True`** triggers certificate retrieval.  
+- `result.ssl_certificate` includes methods (`to_json`, `to_pem`, `to_der`) for saving in various formats (handy for server config, Java keystores, etc.).
+
+---
+
+## 4. Custom Headers
+
+Sometimes you need to set custom headers (e.g., language preferences, authentication tokens, or specialized user-agent strings). You can do this in multiple ways:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    # Option 1: Set headers at the crawler strategy level
+    crawler1 = AsyncWebCrawler(
+        # The underlying strategy can accept headers in its constructor
+        crawler_strategy=None  # We'll override below for clarity
+    )
+    crawler1.crawler_strategy.update_user_agent("MyCustomUA/1.0")
+    crawler1.crawler_strategy.set_custom_headers({
+        "Accept-Language": "fr-FR,fr;q=0.9"
+    })
+    result1 = await crawler1.arun("https://www.example.com")
+    print("Example 1 result success:", result1.success)
+
+    # Option 2: Pass headers directly to `arun()`
+    crawler2 = AsyncWebCrawler()
+    result2 = await crawler2.arun(
+        url="https://www.example.com",
+        headers={"Accept-Language": "es-ES,es;q=0.9"}
+    )
+    print("Example 2 result success:", result2.success)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Notes**  
+- Some sites may react differently to certain headers (e.g., `Accept-Language`).  
+- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-based-crawling.md) or use `UserAgentGenerator`.
+
+---
+
+## 5. Session Persistence & Local Storage
+
+Crawl4AI can preserve cookies and localStorage so you can continue where you left off—ideal for logging into sites or skipping repeated auth flows.
+
+### 5.1 `storage_state`
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    storage_dict = {
+        "cookies": [
+            {
+                "name": "session",
+                "value": "abcd1234",
+                "domain": "example.com",
+                "path": "/",
+                "expires": 1699999999.0,
+                "httpOnly": False,
+                "secure": False,
+                "sameSite": "None"
+            }
+        ],
+        "origins": [
+            {
+                "origin": "https://example.com",
+                "localStorage": [
+                    {"name": "token", "value": "my_auth_token"}
+                ]
+            }
+        ]
+    }
+
+    # Provide the storage state as a dictionary to start "already logged in"
+    async with AsyncWebCrawler(
+        headless=True,
+        storage_state=storage_dict
+    ) as crawler:
+        result = await crawler.arun("https://example.com/protected")
+        if result.success:
+            print("Protected page content length:", len(result.html))
+        else:
+            print("Failed to crawl protected page")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### 5.2 Exporting & Reusing State
+
+You can sign in once, export the browser context, and reuse it later—without re-entering credentials.
+
+- **`await context.storage_state(path="my_storage.json")`**: Exports cookies, localStorage, etc. to a file.  
+- Provide `storage_state="my_storage.json"` on subsequent runs to skip the login step.
+
+**See**: [Detailed session management tutorial](./session-management.md) or [Explanations → Browser Context & Managed Browser](./identity-based-crawling.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages).
+
+---
+
+## 6. Robots.txt Compliance
+
+Crawl4AI supports respecting robots.txt rules with efficient caching:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Enable robots.txt checking in config
+    config = CrawlerRunConfig(
+        check_robots_txt=True  # Will check and respect robots.txt rules
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            "https://example.com",
+            config=config
+        )
+        
+        if not result.success and result.status_code == 403:
+            print("Access denied by robots.txt")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Points**
+- Robots.txt files are cached locally for efficiency
+- Cache is stored in `~/.crawl4ai/robots/robots_cache.db`
+- Cache has a default TTL of 7 days
+- If robots.txt can't be fetched, crawling is allowed
+- Returns 403 status code if URL is disallowed
+
+---
+
+## Putting It All Together
+
+Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs.
+
+```python
+import os, asyncio
+from base64 import b64decode
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    # 1. Browser config with proxy + headless
+    browser_cfg = BrowserConfig(
+        proxy_config={
+            "server": "http://proxy.example.com:8080",
+            "username": "myuser",
+            "password": "mypass",
+        },
+        headless=True,
+    )
+
+    # 2. Crawler config with PDF, screenshot, SSL, custom headers, and ignoring caches
+    crawler_cfg = CrawlerRunConfig(
+        pdf=True,
+        screenshot=True,
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS,
+        headers={"Accept-Language": "en-US,en;q=0.8"},
+        storage_state="my_storage.json",  # Reuse session from a previous sign-in
+        verbose=True,
+    )
+
+    # 3. Crawl
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun(
+            url = "https://secure.example.com/protected", 
+            config=crawler_cfg
+        )
+        
+        if result.success:
+            print("[OK] Crawled the secure page. Links found:", len(result.links.get("internal", [])))
+            
+            # Save PDF & screenshot
+            if result.pdf:
+                with open("result.pdf", "wb") as f:
+                    f.write(b64decode(result.pdf))
+            if result.screenshot:
+                with open("result.png", "wb") as f:
+                    f.write(b64decode(result.screenshot))
+            
+            # Check SSL cert
+            if result.ssl_certificate:
+                print("SSL Issuer CN:", result.ssl_certificate.issuer.get("CN", ""))
+        else:
+            print("[ERROR]", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## Conclusion & Next Steps
+
+You’ve now explored several **advanced** features:
+
+- **Proxy Usage**  
+- **PDF & Screenshot** capturing for large or critical pages  
+- **SSL Certificate** retrieval & exporting  
+- **Custom Headers** for language or specialized requests  
+- **Session Persistence** via storage state
+- **Robots.txt Compliance**
+
+With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
+
+**Last Updated**: 2025-01-01
+```
+
+
+## File: docs/md_v2/advanced/crawl-dispatcher.md
+
+```md
+# Crawl Dispatcher
+
+We’re excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler’s status, memory usage, and overall progress.
+
+Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X.
+
+Below is a **sample** of how the dispatcher’s performance monitor might look in action:
+
+![Crawl Dispatcher Performance Monitor](../assets/images/dispatcher.png)
+
+
+We can’t wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates!
+```
+
+
+## File: docs/md_v2/advanced/file-downloading.md
+
+```md
+# Download Handling in Crawl4AI
+
+This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
+
+## Enabling Downloads
+
+To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler.
+
+```python
+from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler
+
+async def main():
+    config = BrowserConfig(accept_downloads=True)  # Enable downloads globally
+    async with AsyncWebCrawler(config=config) as crawler:
+        # ... your crawling logic ...
+
+asyncio.run(main())
+```
+
+## Specifying Download Location
+
+Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory.
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+import os
+
+downloads_path = os.path.join(os.getcwd(), "my_downloads")  # Custom download path
+os.makedirs(downloads_path, exist_ok=True)
+
+config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path)
+
+async def main():
+    async with AsyncWebCrawler(config=config) as crawler:
+        result = await crawler.arun(url="https://example.com")
+        # ...
+```
+
+## Triggering Downloads
+
+Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start.
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig(
+    js_code="""
+        const downloadLink = document.querySelector('a[href$=".exe"]');
+        if (downloadLink) {
+            downloadLink.click();
+        }
+    """,
+    wait_for=5  # Wait 5 seconds for the download to start
+)
+
+result = await crawler.arun(url="https://www.python.org/downloads/", config=config)
+```
+
+## Accessing Downloaded Files
+
+The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files.
+
+```python
+if result.downloaded_files:
+    print("Downloaded files:")
+    for file_path in result.downloaded_files:
+        print(f"- {file_path}")
+        file_size = os.path.getsize(file_path)
+        print(f"- File size: {file_size} bytes")
+else:
+    print("No files downloaded.")
+```
+
+## Example: Downloading Multiple Files
+
+```python
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import os
+from pathlib import Path
+
+async def download_multiple_files(url: str, download_path: str):
+    config = BrowserConfig(accept_downloads=True, downloads_path=download_path)
+    async with AsyncWebCrawler(config=config) as crawler:
+        run_config = CrawlerRunConfig(
+            js_code="""
+                const downloadLinks = document.querySelectorAll('a[download]');
+                for (const link of downloadLinks) {
+                    link.click();
+                    // Delay between clicks
+                    await new Promise(r => setTimeout(r, 2000));  
+                }
+            """,
+            wait_for=10  # Wait for all downloads to start
+        )
+        result = await crawler.arun(url=url, config=run_config)
+
+        if result.downloaded_files:
+            print("Downloaded files:")
+            for file in result.downloaded_files:
+                print(f"- {file}")
+        else:
+            print("No files downloaded.")
+
+# Usage
+download_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
+os.makedirs(download_path, exist_ok=True)
+
+asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path))
+```
+
+## Important Considerations
+
+- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage.
+- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing.
+- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully.
+- **Security:** Scan downloaded files for potential security threats before use.
+
+This guide uses `BrowserConfig` and `CrawlerRunConfig` for all download-related configuration, consistent with the rest of the `Crawl4AI` codebase.
+```
+
+
+## File: docs/md_v2/advanced/hooks-auth.md
+
+```md
+# Hooks & Auth in AsyncWebCrawler
+
+Crawl4AI’s **hooks** let you customize the crawler at specific points in the pipeline:
+
+1. **`on_browser_created`** – After browser creation.  
+2. **`on_page_context_created`** – After a new context & page are created.  
+3. **`before_goto`** – Just before navigating to a page.  
+4. **`after_goto`** – Right after navigation completes.  
+5. **`on_user_agent_updated`** – Whenever the user agent changes.  
+6. **`on_execution_started`** – Once custom JavaScript execution begins.  
+7. **`before_retrieve_html`** – Just before the crawler retrieves final HTML.  
+8. **`before_return_html`** – Right before returning the HTML content.
+
+**Important**: Avoid heavy tasks in `on_browser_created` since you don’t yet have a page context. If you need to *log in*, do so in **`on_page_context_created`**.
+
+> note "Important Hook Usage Warning"
+    **Avoid Misusing Hooks**: Do not manipulate page objects in the wrong hook or at the wrong time, as it can crash the pipeline or produce incorrect results. A common mistake is attempting to handle authentication prematurely—such as creating or closing pages in `on_browser_created`. 
+
+>   **Use the Right Hook for Auth**: If you need to log in or set tokens, use `on_page_context_created`. This ensures you have a valid page/context to work with, without disrupting the main crawling flow.
+
+>    **Identity-Based Crawling**: For robust auth, consider identity-based crawling (or passing a session ID) to preserve state. Run your initial login steps in a separate, well-defined process, then feed that session to your main crawl—rather than shoehorning complex authentication into early hooks. Check out [Identity-Based Crawling](../advanced/identity-based-crawling.md) for more details.
+
+>    **Be Cautious**: Overwriting or removing elements in the wrong hook can compromise the final crawl. Keep hooks focused on smaller tasks (like route filters, custom headers), and let your main logic (crawling, data extraction) proceed normally.
+
+
+Below is an example demonstration.
+
+---
+
+## Example: Using Hooks in AsyncWebCrawler
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
+
+async def main():
+    print("🔗 Hooks Example: Demonstrating recommended usage")
+
+    # 1) Configure the browser
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=True
+    )
+
+    # 2) Configure the crawler run
+    crawler_run_config = CrawlerRunConfig(
+        js_code="window.scrollTo(0, document.body.scrollHeight);",
+        wait_for="body",
+        cache_mode=CacheMode.BYPASS
+    )
+
+    # 3) Create the crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+
+    #
+    # Define Hook Functions
+    #
+
+    async def on_browser_created(browser, **kwargs):
+        # Called once the browser instance is created (but no pages or contexts yet)
+        print("[HOOK] on_browser_created - Browser created successfully!")
+        # Typically, do minimal setup here if needed
+        return browser
+
+    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
+        # Called right after a new page + context are created (ideal for auth or route config).
+        print("[HOOK] on_page_context_created - Setting up page & context.")
+        
+        # Example 1: Route filtering (e.g., block images)
+        async def route_filter(route):
+            if route.request.resource_type == "image":
+                print(f"[HOOK] Blocking image request: {route.request.url}")
+                await route.abort()
+            else:
+                await route.continue_()
+
+        await context.route("**", route_filter)
+
+        # Example 2: (Optional) Simulate a login scenario
+        # (We do NOT create or close pages here, just do quick steps if needed)
+        # e.g., await page.goto("https://example.com/login")
+        # e.g., await page.fill("input[name='username']", "testuser")
+        # e.g., await page.fill("input[name='password']", "password123")
+        # e.g., await page.click("button[type='submit']")
+        # e.g., await page.wait_for_selector("#welcome")
+        # e.g., await context.add_cookies([...])
+        # Then continue
+
+        # Example 3: Adjust the viewport
+        await page.set_viewport_size({"width": 1080, "height": 600})
+        return page
+
+    async def before_goto(
+        page: Page, context: BrowserContext, url: str, **kwargs
+    ):
+        # Called before navigating to each URL.
+        print(f"[HOOK] before_goto - About to navigate: {url}")
+        # e.g., inject custom headers
+        await page.set_extra_http_headers({
+            "Custom-Header": "my-value"
+        })
+        return page
+
+    async def after_goto(
+        page: Page, context: BrowserContext, 
+        url: str, response, **kwargs
+    ):
+        # Called after navigation completes.
+        print(f"[HOOK] after_goto - Successfully loaded: {url}")
+        # e.g., wait for a certain element if we want to verify
+        try:
+            await page.wait_for_selector('.content', timeout=1000)
+            print("[HOOK] Found .content element!")
+        except:
+            print("[HOOK] .content not found, continuing anyway.")
+        return page
+
+    async def on_user_agent_updated(
+        page: Page, context: BrowserContext, 
+        user_agent: str, **kwargs
+    ):
+        # Called whenever the user agent updates.
+        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
+        return page
+
+    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+        # Called after custom JavaScript execution begins.
+        print("[HOOK] on_execution_started - JS code is running!")
+        return page
+
+    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
+        # Called before final HTML retrieval.
+        print("[HOOK] before_retrieve_html - We can do final actions")
+        # Example: Scroll again
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+        return page
+
+    async def before_return_html(
+        page: Page, context: BrowserContext, html: str, **kwargs
+    ):
+        # Called just before returning the HTML in the result.
+        print(f"[HOOK] before_return_html - HTML length: {len(html)}")
+        return page
+
+    #
+    # Attach Hooks
+    #
+
+    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+    crawler.crawler_strategy.set_hook(
+        "on_page_context_created", on_page_context_created
+    )
+    crawler.crawler_strategy.set_hook("before_goto", before_goto)
+    crawler.crawler_strategy.set_hook("after_goto", after_goto)
+    crawler.crawler_strategy.set_hook(
+        "on_user_agent_updated", on_user_agent_updated
+    )
+    crawler.crawler_strategy.set_hook(
+        "on_execution_started", on_execution_started
+    )
+    crawler.crawler_strategy.set_hook(
+        "before_retrieve_html", before_retrieve_html
+    )
+    crawler.crawler_strategy.set_hook(
+        "before_return_html", before_return_html
+    )
+
+    await crawler.start()
+
+    # 4) Run the crawler on an example page
+    url = "https://example.com"
+    result = await crawler.arun(url, config=crawler_run_config)
+    
+    if result.success:
+        print("\nCrawled URL:", result.url)
+        print("HTML length:", len(result.html))
+    else:
+        print("Error:", result.error_message)
+
+    await crawler.close()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## Hook Lifecycle Summary
+
+1. **`on_browser_created`**:  
+   - Browser is up, but **no** pages or contexts yet.  
+   - Light setup only—don’t try to open or close pages here (that belongs in `on_page_context_created`).
+
+2. **`on_page_context_created`**:  
+   - Perfect for advanced **auth** or route blocking.  
+   - You have a **page** + **context** ready but haven’t navigated to the target URL yet.
+
+3. **`before_goto`**:  
+   - Right before navigation. Typically used for setting **custom headers** or logging the target URL.
+
+4. **`after_goto`**:  
+   - After page navigation is done. Good place for verifying content or waiting on essential elements. 
+
+5. **`on_user_agent_updated`**:  
+   - Whenever the user agent changes (for stealth or different UA modes).
+
+6. **`on_execution_started`**:  
+   - If you set `js_code` or run custom scripts, this runs once your JS is about to start.
+
+7. **`before_retrieve_html`**:  
+   - Just before the final HTML snapshot is taken. Often you do a final scroll or lazy-load triggers here.
+
+8. **`before_return_html`**:  
+   - The last hook before returning HTML to the `CrawlResult`. Good for logging HTML length or minor modifications.
+
+---
+
+## When to Handle Authentication
+
+**Recommended**: Use **`on_page_context_created`** if you need to:
+
+- Navigate to a login page or fill forms
+- Set cookies or localStorage tokens
+- Block resource routes to avoid ads
+
+This ensures the newly created context is under your control **before** `arun()` navigates to the main URL.
+
+---
+
+## Additional Considerations
+
+- **Session Management**: If you want multiple `arun()` calls to reuse a single session, pass `session_id=` in your `CrawlerRunConfig`. Hooks remain the same.  
+- **Performance**: Hooks can slow down crawling if they do heavy tasks. Keep them concise.  
+- **Error Handling**: If a hook fails, the overall crawl might fail. Catch exceptions or handle them gracefully.  
+- **Concurrency**: If you run `arun_many()`, each URL triggers these hooks in parallel. Ensure your hooks are thread/async-safe.
+
+---
+
+## Conclusion
+
+Hooks provide **fine-grained** control over:
+
+- **Browser** creation (light tasks only)
+- **Page** and **context** creation (auth, route blocking)
+- **Navigation** phases
+- **Final HTML** retrieval
+
+Follow the recommended usage:
+- **Login** or advanced tasks in `on_page_context_created`  
+- **Custom headers** or logs in `before_goto` / `after_goto`  
+- **Scrolling** or final checks in `before_retrieve_html` / `before_return_html`
+
+
+```
+
+
+## File: docs/md_v2/advanced/identity-based-crawling.md
+
+```md
+# Preserve Your Identity with Crawl4AI
+
+Crawl4AI empowers you to navigate and interact with the web using your **authentic digital identity**, ensuring you’re recognized as a human and not mistaken for a bot. This tutorial covers:
+
+1. **Managed Browsers** – The recommended approach for persistent profiles and identity-based crawling.  
+2. **Magic Mode** – A simplified fallback solution for quick automation without persistent identity.
+
+---
+
+## 1. Managed Browsers: Your Digital Identity Solution
+
+**Managed Browsers** let developers create and use **persistent browser profiles**. These profiles store local storage, cookies, and other session data, letting you browse as your **real self**—complete with logins, preferences, and cookies.
+
+### Key Benefits
+
+- **Authentic Browsing Experience**: Retain session data and browser fingerprints as though you’re a normal user.  
+- **Effortless Configuration**: Once you log in or solve CAPTCHAs in your chosen data directory, you can re-run crawls without repeating those steps.  
+- **Empowered Data Access**: If you can see the data in your own browser, you can automate its retrieval with your genuine identity.
+
+---
+
+The following section explains how to **create a user-data directory** using **Playwright’s Chromium** binary rather than a system-wide Chrome/Edge. It shows how to **locate** that binary and launch it with a `--user-data-dir` argument to set up your profile. You can then point `BrowserConfig.user_data_dir` to that folder for subsequent crawls.
+
+---
+
+### Creating a User Data Directory (Command-Line Approach via Playwright)
+
+If you installed Crawl4AI (which installs Playwright under the hood), you already have a Playwright-managed Chromium on your system. Follow these steps to launch that **Chromium** from your command line, specifying a **custom** data directory:
+
+1. **Find** the Playwright Chromium binary:
+   - On most systems, installed browsers go under a `~/.cache/ms-playwright/` folder or similar path.  
+   - To see an overview of installed browsers, run:
+     ```bash
+     python -m playwright install --dry-run
+     ```
+     or
+     ```bash
+     playwright install --dry-run
+     ```
+     (depending on your environment). This shows where Playwright keeps Chromium.
+
+   - For instance, you might see a path like:
+     ```
+     ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome
+     ```
+     on Linux, or a corresponding folder on macOS/Windows.
+
+2. **Launch** the Playwright Chromium binary with a **custom** user-data directory:
+   ```bash
+   # Linux example
+   ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome \
+       --user-data-dir=/home/<you>/my_chrome_profile
+   ```
+   ```bash
+   # macOS example (Playwright’s internal binary)
+   ~/Library/Caches/ms-playwright/chromium-1234/chrome-mac/Chromium.app/Contents/MacOS/Chromium \
+       --user-data-dir=/Users/<you>/my_chrome_profile
+   ```
+   ```powershell
+   # Windows example (PowerShell/cmd)
+   "C:\Users\<you>\AppData\Local\ms-playwright\chromium-1234\chrome-win\chrome.exe" ^
+       --user-data-dir="C:\Users\<you>\my_chrome_profile"
+   ```
+   
+   **Replace** the path with the actual subfolder indicated in your `ms-playwright` cache structure.  
+   - This **opens** a fresh Chromium with your new or existing data folder.  
+   - **Log into** any sites or configure your browser the way you want.  
+   - **Close** when done—your profile data is saved in that folder.
+
+3. **Use** that folder in **`BrowserConfig.user_data_dir`**:
+   ```python
+   from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+   browser_config = BrowserConfig(
+       headless=True,
+       use_managed_browser=True,
+       user_data_dir="/home/<you>/my_chrome_profile",
+       browser_type="chromium"
+   )
+   ```
+   - Next time you run your code, it reuses that folder—**preserving** your session data, cookies, local storage, etc.
+
+---
+
+## 3. Using Managed Browsers in Crawl4AI
+
+Once you have a data directory with your session data, pass it to **`BrowserConfig`**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    # 1) Reference your persistent data directory
+    browser_config = BrowserConfig(
+        headless=True,             # 'True' for automated runs
+        verbose=True,
+        use_managed_browser=True,  # Enables persistent browser strategy
+        browser_type="chromium",
+        user_data_dir="/path/to/my-chrome-profile"
+    )
+
+    # 2) Standard crawl config
+    crawl_config = CrawlerRunConfig(
+        wait_for="css:.logged-in-content"
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="https://example.com/private", config=crawl_config)
+        if result.success:
+            print("Successfully accessed private data with your identity!")
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### Workflow
+
+1. **Log in** externally (via CLI or your normal Chrome with `--user-data-dir=...`).  
+2. **Close** that browser.  
+3. **Use** the same folder in `user_data_dir=` in Crawl4AI.  
+4. **Crawl** – The site sees your identity as if you’re the same user who just logged in.
+
+---
+
+## 4. Magic Mode: Simplified Automation
+
+If you **don’t** need a persistent profile or identity-based approach, **Magic Mode** offers a quick way to simulate human-like browsing without storing long-term data.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun(
+        url="https://example.com",
+        config=CrawlerRunConfig(
+            magic=True,  # Simplifies a lot of interaction
+            remove_overlay_elements=True,
+            page_timeout=60000
+        )
+    )
+```
+
+**Magic Mode**:
+
+- Simulates a user-like experience  
+- Randomizes user agent & navigator
+- Randomizes interactions & timings  
+- Masks automation signals  
+- Attempts pop-up handling  
+
+**But** it’s no substitute for **true** user-based sessions if you want a fully legitimate identity-based solution.
+
+---
+
+## 5. Comparing Managed Browsers vs. Magic Mode
+
+| Feature                    | **Managed Browsers**                                           | **Magic Mode**                                     |
+|----------------------------|---------------------------------------------------------------|-----------------------------------------------------|
+| **Session Persistence**    | Full localStorage/cookies retained in user_data_dir           | No persistent data (fresh each run)                |
+| **Genuine Identity**       | Real user profile with full rights & preferences              | Emulated user-like patterns, but no actual identity |
+| **Complex Sites**          | Best for login-gated sites or heavy config                    | Simple tasks, minimal login or config needed        |
+| **Setup**                  | External creation of user_data_dir, then use in Crawl4AI       | Single-line approach (`magic=True`)                 |
+| **Reliability**            | Extremely consistent (same data across runs)                  | Good for smaller tasks, can be less stable          |
+
+---
+
+## 6. Using the BrowserProfiler Class
+
+Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.
+
+### Creating and Managing Profiles with BrowserProfiler
+
+The `BrowserProfiler` class offers a comprehensive API for browser profile management:
+
+```python
+import asyncio
+from crawl4ai import BrowserProfiler
+
+async def manage_profiles():
+    # Create a profiler instance
+    profiler = BrowserProfiler()
+    
+    # Create a profile interactively - opens a browser window
+    profile_path = await profiler.create_profile(
+        profile_name="my-login-profile"  # Optional: name your profile
+    )
+    
+    print(f"Profile saved at: {profile_path}")
+    
+    # List all available profiles
+    profiles = profiler.list_profiles()
+    
+    for profile in profiles:
+        print(f"Profile: {profile['name']}")
+        print(f"  Path: {profile['path']}")
+        print(f"  Created: {profile['created']}")
+        print(f"  Browser type: {profile['type']}")
+    
+    # Get a specific profile path by name
+    specific_profile = profiler.get_profile_path("my-login-profile")
+    
+    # Delete a profile when no longer needed
+    success = profiler.delete_profile("old-profile-name")
+    
+asyncio.run(manage_profiles())
+```
+
+**How profile creation works:**
+1. A browser window opens for you to interact with
+2. You log in to websites, set preferences, etc.
+3. When you're done, press 'q' in the terminal to close the browser
+4. The profile is saved in the Crawl4AI profiles directory
+5. You can use the returned path with `BrowserConfig.user_data_dir`
+
+### Interactive Profile Management
+
+The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:
+
+```python
+import asyncio
+from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig
+
+# Define a function to use a profile for crawling
+async def crawl_with_profile(profile_path, url):
+    browser_config = BrowserConfig(
+        headless=True,
+        use_managed_browser=True,
+        user_data_dir=profile_path
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url)
+        return result
+
+async def main():
+    # Create a profiler instance
+    profiler = BrowserProfiler()
+    
+    # Launch the interactive profile manager
+    # Passing the crawl function as a callback adds a "crawl with profile" option
+    await profiler.interactive_manager(crawl_callback=crawl_with_profile)
+    
+asyncio.run(main())
+```
+
+### Legacy Methods
+
+For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:
+
+```python
+from crawl4ai.browser_manager import ManagedBrowser
+
+# These methods still work but use BrowserProfiler internally
+profiles = ManagedBrowser.list_profiles()
+```
+
+### Complete Example
+
+See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class.
+
+---
+
+## 7. Summary
+
+- **Create** your user-data directory either:
+  - By launching Chrome/Chromium externally with `--user-data-dir=/some/path` 
+  - Or by using the built-in `BrowserProfiler.create_profile()` method
+  - Or through the interactive interface with `profiler.interactive_manager()`
+- **Log in** or configure sites as needed, then close the browser
+- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
+- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
+- **Manage** your profiles with the dedicated `BrowserProfiler` class
+- Enjoy **persistent** sessions that reflect your real identity
+- If you only need quick, ephemeral automation, **Magic Mode** might suffice
+
+**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.
+
+With these approaches, you preserve your **authentic** browsing environment, ensuring the site sees you exactly as a normal user—no repeated logins or wasted time.
+```
+
+
+## File: docs/md_v2/advanced/lazy-loading.md
+
+```md
+## Handling Lazy-Loaded Images
+
+Many websites now load images **lazily** as you scroll. If you need to ensure they appear in your final crawl (and in `result.media`), consider:
+
+1. **`wait_for_images=True`** – Wait for images to fully load.  
+2. **`scan_full_page`** – Force the crawler to scroll the entire page, triggering lazy loads.  
+3. **`scroll_delay`** – Add small delays between scroll steps.  
+
+**Note**: If the site requires multiple “Load More” triggers or complex interactions, see the [Page Interaction docs](../core/page-interaction.md).
+
+### Example: Ensuring Lazy Images Appear
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
+from crawl4ai.async_configs import CacheMode
+
+async def main():
+    config = CrawlerRunConfig(
+        # Force the crawler to wait until images are fully loaded
+        wait_for_images=True,
+
+        # Option 1: If you want to automatically scroll the page to load images
+        scan_full_page=True,  # Tells the crawler to try scrolling the entire page
+        scroll_delay=0.5,     # Delay (seconds) between scroll steps
+
+        # Option 2: If the site uses a 'Load More' or JS triggers for images,
+        # you can also specify js_code or wait_for logic here.
+
+        cache_mode=CacheMode.BYPASS,
+        verbose=True
+    )
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        result = await crawler.arun("https://www.example.com/gallery", config=config)
+        
+        if result.success:
+            images = result.media.get("images", [])
+            print("Images found:", len(images))
+            for i, img in enumerate(images[:5]):
+                print(f"[Image {i}] URL: {img['src']}, Score: {img.get('score','N/A')}")
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Explanation**:
+
+- **`wait_for_images=True`**  
+  The crawler tries to ensure images have finished loading before finalizing the HTML.  
+- **`scan_full_page=True`**  
+  Tells the crawler to attempt scrolling from top to bottom. Each scroll step helps trigger lazy loading.  
+- **`scroll_delay=0.5`**  
+  Pause half a second between each scroll step. Helps the site load images before continuing.
+
+**When to Use**:
+
+- **Lazy-Loading**: If images appear only when the user scrolls into view, `scan_full_page` + `scroll_delay` helps the crawler see them.  
+- **Heavier Pages**: If a page is extremely long, be mindful that scanning the entire page can be slow. Adjust `scroll_delay` or the max scroll steps as needed.
+
+---
+
+## Combining with Other Link & Media Filters
+
+You can still combine **lazy-load** logic with the usual **exclude_external_images**, **exclude_domains**, or link filtering:
+
+```python
+config = CrawlerRunConfig(
+    wait_for_images=True,
+    scan_full_page=True,
+    scroll_delay=0.5,
+
+    # Filter out external images if you only want local ones
+    exclude_external_images=True,
+
+    # Exclude certain domains for links
+    exclude_domains=["spammycdn.com"],
+)
+```
+
+This approach ensures you see **all** images from the main domain while ignoring external ones, and the crawler physically scrolls the entire page so that lazy-loading triggers.
+
+---
+
+## Tips & Troubleshooting
+
+1. **Long Pages**  
+   - Setting `scan_full_page=True` on extremely long or infinite-scroll pages can be resource-intensive.  
+   - Consider using [hooks](../core/page-interaction.md) or specialized logic to load specific sections or “Load More” triggers repeatedly.
+
+2. **Mixed Image Behavior**  
+   - Some sites load images in batches as you scroll. If you’re missing images, increase your `scroll_delay` or call multiple partial scrolls in a loop with JS code or hooks.
+
+3. **Combining with Dynamic Wait**  
+   - If the site has a placeholder that only changes to a real image after a certain event, you might do `wait_for="css:img.loaded"` or a custom JS `wait_for`.
+
+4. **Caching**  
+   - If `cache_mode` is enabled, repeated crawls might skip some network fetches. If you suspect cached results are hiding new images, set `cache_mode=CacheMode.BYPASS` for fresh fetches.
+
+---
+
+With **lazy-loading** support, **wait_for_images**, and **scan_full_page** settings, you can capture the entire gallery or feed of images you expect—even if the site only loads them as the user scrolls. Combine these with the standard media filtering and domain exclusion for a complete link & media handling strategy.
+```
+
+
+## File: docs/md_v2/advanced/multi-url-crawling.md
+
+```md
+# Advanced Multi-URL Crawling with Dispatchers
+
+> **Heads Up**: Crawl4AI supports advanced dispatchers for **parallel** or **throttled** crawling, providing dynamic rate limiting and memory usage checks. The built-in `arun_many()` function uses these dispatchers to handle concurrency efficiently.
+
+## 1. Introduction
+
+When crawling many URLs:
+
+- **Basic**: Use `arun()` in a loop (simple but less efficient)
+- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
+- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
+
+**Why Dispatchers?**  
+
+- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
+- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
+- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
+- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
+
+---
+
+## 2. Core Components
+
+### 2.1 Rate Limiter
+
+```python
+class RateLimiter:
+    def __init__(
+        # Random delay range between requests
+        base_delay: Tuple[float, float] = (1.0, 3.0),  
+        
+        # Maximum backoff delay
+        max_delay: float = 60.0,                        
+        
+        # Retries before giving up
+        max_retries: int = 3,                          
+        
+        # Status codes triggering backoff
+        rate_limit_codes: List[int] = [429, 503]        
+    )
+```
+
+This section explains the **RateLimiter**, focusing on its constructor parameters.
+
+#### RateLimiter Constructor Parameters
+
+The **RateLimiter** is a utility that helps manage the pace of requests to avoid overloading servers or getting blocked due to rate limits. It operates internally to delay requests and handle retries but can be configured using its constructor parameters.
+
+**Parameters of the `RateLimiter` constructor:**
+
+1. **`base_delay`** (`Tuple[float, float]`, default: `(1.0, 3.0)`)  
+  The range for a random delay (in seconds) between consecutive requests to the same domain.
+
+- A random delay is chosen between `base_delay[0]` and `base_delay[1]` for each request.  
+- This prevents sending requests at a predictable frequency, reducing the chances of triggering rate limits.
+
+**Example:**  
+If `base_delay = (2.0, 5.0)`, delays could be randomly chosen as `2.3s`, `4.1s`, etc.
+
+---
+
+2. **`max_delay`** (`float`, default: `60.0`)  
+  The maximum allowable delay when rate-limiting errors occur.
+
+- When servers return rate-limit responses (e.g., 429 or 503), the delay increases exponentially with jitter.  
+- The `max_delay` ensures the delay doesn’t grow unreasonably high, capping it at this value.
+
+**Example:**  
+For a `max_delay = 30.0`, even if backoff calculations suggest a delay of `45s`, it will cap at `30s`.
+
+---
+
+3. **`max_retries`** (`int`, default: `3`)  
+  The maximum number of retries for a request if rate-limiting errors occur.
+
+- After encountering a rate-limit response, the `RateLimiter` retries the request up to this number of times.  
+- If all retries fail, the request is marked as failed, and the process continues.
+
+**Example:**  
+If `max_retries = 3`, the system retries a failed request three times before giving up.
+
+---
+
+4. **`rate_limit_codes`** (`List[int]`, default: `[429, 503]`)  
+  A list of HTTP status codes that trigger the rate-limiting logic.
+
+- These status codes indicate the server is overwhelmed or actively limiting requests.  
+- You can customize this list to include other codes based on specific server behavior.
+
+**Example:**  
+If `rate_limit_codes = [429, 503, 504]`, the crawler will back off on these three error codes.
+
+---
+
+**How to Use the `RateLimiter`:**
+
+Here’s an example of initializing and using a `RateLimiter` in your project:
+
+```python
+from crawl4ai import RateLimiter
+
+# Create a RateLimiter with custom settings
+rate_limiter = RateLimiter(
+    base_delay=(2.0, 4.0),  # Random delay between 2-4 seconds
+    max_delay=30.0,         # Cap delay at 30 seconds
+    max_retries=5,          # Retry up to 5 times on rate-limiting errors
+    rate_limit_codes=[429, 503]  # Handle these HTTP status codes
+)
+
+# RateLimiter will handle delays and retries internally
+# No additional setup is required for its operation
+```
+
+The `RateLimiter` integrates seamlessly with dispatchers like `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher`, ensuring requests are paced correctly without user intervention. Its internal mechanisms manage delays and retries to avoid overwhelming servers while maximizing efficiency.
+
+
+### 2.2 Crawler Monitor
+
+The CrawlerMonitor provides real-time visibility into crawling operations:
+
+```python
+from crawl4ai import CrawlerMonitor, DisplayMode
+monitor = CrawlerMonitor(
+    # Maximum rows in live display
+    max_visible_rows=15,          
+
+    # DETAILED or AGGREGATED view
+    display_mode=DisplayMode.DETAILED  
+)
+```
+
+**Display Modes**:
+
+1. **DETAILED**: Shows individual task status, memory usage, and timing
+2. **AGGREGATED**: Displays summary statistics and overall progress
+
+---
+
+## 3. Available Dispatchers
+
+### 3.1 MemoryAdaptiveDispatcher (Default)
+
+Automatically manages concurrency based on system memory usage:
+
+```python
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
+
+dispatcher = MemoryAdaptiveDispatcher(
+    memory_threshold_percent=90.0,  # Pause if memory exceeds this
+    check_interval=1.0,             # How often to check memory
+    max_session_permit=10,          # Maximum concurrent tasks
+    rate_limiter=RateLimiter(       # Optional rate limiting
+        base_delay=(1.0, 2.0),
+        max_delay=30.0,
+        max_retries=2
+    ),
+    monitor=CrawlerMonitor(         # Optional monitoring
+        max_visible_rows=15,
+        display_mode=DisplayMode.DETAILED
+    )
+)
+```
+
+**Constructor Parameters:**
+
+1. **`memory_threshold_percent`** (`float`, default: `90.0`)  
+  Specifies the memory usage threshold (as a percentage). If system memory usage exceeds this value, the dispatcher pauses crawling to prevent system overload.
+
+2. **`check_interval`** (`float`, default: `1.0`)  
+  The interval (in seconds) at which the dispatcher checks system memory usage.
+
+3. **`max_session_permit`** (`int`, default: `10`)  
+  The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
+
+4. **`memory_wait_timeout`** (`float`, default: `300.0`)  
+  Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
+
+5. **`rate_limiter`** (`RateLimiter`, default: `None`)  
+  Optional rate-limiting logic to avoid server-side blocking (e.g., for handling 429 or 503 errors). See **RateLimiter** for details.
+
+6. **`monitor`** (`CrawlerMonitor`, default: `None`)  
+  Optional monitoring for real-time task tracking and performance insights. See **CrawlerMonitor** for details.
+
+---
+
+### 3.2 SemaphoreDispatcher
+
+Provides simple concurrency control with a fixed limit:
+
+```python
+from crawl4ai.async_dispatcher import SemaphoreDispatcher
+
+dispatcher = SemaphoreDispatcher(
+    max_session_permit=20,         # Maximum concurrent tasks
+    rate_limiter=RateLimiter(      # Optional rate limiting
+        base_delay=(0.5, 1.0),
+        max_delay=10.0
+    ),
+    monitor=CrawlerMonitor(        # Optional monitoring
+        max_visible_rows=15,
+        display_mode=DisplayMode.DETAILED
+    )
+)
+```
+
+**Constructor Parameters:**
+
+1. **`max_session_permit`** (`int`, default: `20`)  
+  The maximum number of concurrent crawling tasks allowed, irrespective of semaphore slots.
+
+2. **`rate_limiter`** (`RateLimiter`, default: `None`)  
+  Optional rate-limiting logic to avoid overwhelming servers. See **RateLimiter** for details.
+
+3. **`monitor`** (`CrawlerMonitor`, default: `None`)  
+  Optional monitoring for tracking task progress and resource usage. See **CrawlerMonitor** for details.
+
+---
+
+## 4. Usage Examples
+
+### 4.1 Batch Processing (Default)
+
+```python
+async def crawl_batch():
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=False  # Default: get all results at once
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=70.0,
+        check_interval=1.0,
+        max_session_permit=10,
+        monitor=CrawlerMonitor(
+            display_mode=DisplayMode.DETAILED
+        )
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Get all results at once
+        results = await crawler.arun_many(
+            urls=urls,
+            config=run_config,
+            dispatcher=dispatcher
+        )
+        
+        # Process all results after completion
+        for result in results:
+            if result.success:
+                await process_result(result)
+            else:
+                print(f"Failed to crawl {result.url}: {result.error_message}")
+```
+
+**Review:**  
+- **Purpose:** Executes a batch crawl with all URLs processed together after crawling is complete.  
+- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` to manage concurrency and system memory.  
+- **Stream:** Disabled (`stream=False`), so all results are collected at once for post-processing.  
+- **Best Use Case:** When you need to analyze results in bulk rather than individually during the crawl.
+
+---
+
+### 4.2 Streaming Mode
+
+```python
+async def crawl_streaming():
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=True  # Enable streaming mode
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=70.0,
+        check_interval=1.0,
+        max_session_permit=10,
+        monitor=CrawlerMonitor(
+            display_mode=DisplayMode.DETAILED
+        )
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Process results as they become available
+        async for result in await crawler.arun_many(
+            urls=urls,
+            config=run_config,
+            dispatcher=dispatcher
+        ):
+            if result.success:
+                # Process each result immediately
+                await process_result(result)
+            else:
+                print(f"Failed to crawl {result.url}: {result.error_message}")
+```
+
+**Review:**  
+- **Purpose:** Enables streaming to process results as soon as they’re available.  
+- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` for concurrency and memory management.  
+- **Stream:** Enabled (`stream=True`), allowing real-time processing during crawling.  
+- **Best Use Case:** When you need to act on results immediately, such as for real-time analytics or progressive data storage.
+
+---
+
+### 4.3 Semaphore-based Crawling
+
+```python
+async def crawl_with_semaphore(urls):
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+    
+    dispatcher = SemaphoreDispatcher(
+        semaphore_count=5,
+        rate_limiter=RateLimiter(
+            base_delay=(0.5, 1.0),
+            max_delay=10.0
+        ),
+        monitor=CrawlerMonitor(
+            max_visible_rows=15,
+            display_mode=DisplayMode.DETAILED
+        )
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        results = await crawler.arun_many(
+            urls, 
+            config=run_config,
+            dispatcher=dispatcher
+        )
+        return results
+```
+
+**Review:**  
+- **Purpose:** Uses `SemaphoreDispatcher` to limit concurrency with a fixed number of slots.  
+- **Dispatcher:** Configured with a semaphore to control parallel crawling tasks.  
+- **Rate Limiter:** Prevents servers from being overwhelmed by pacing requests.  
+- **Best Use Case:** When you want precise control over the number of concurrent requests, independent of system memory.
+
+---
+
+### 4.4 Robots.txt Consideration
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    urls = [
+        "https://example1.com",
+        "https://example2.com",
+        "https://example3.com"
+    ]
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+        check_robots_txt=True,  # Will respect robots.txt for each URL
+        semaphore_count=3      # Max concurrent requests
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in crawler.arun_many(urls, config=config):
+            if result.success:
+                print(f"Successfully crawled {result.url}")
+            elif result.status_code == 403 and "robots.txt" in result.error_message:
+                print(f"Skipped {result.url} - blocked by robots.txt")
+            else:
+                print(f"Failed to crawl {result.url}: {result.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Review:**  
+- **Purpose:** Ensures compliance with `robots.txt` rules for ethical and legal web crawling.  
+- **Configuration:** Set `check_robots_txt=True` to validate each URL against `robots.txt` before crawling.  
+- **Dispatcher:** Handles requests with concurrency limits (`semaphore_count=3`).  
+- **Best Use Case:** When crawling websites that strictly enforce robots.txt policies or for responsible crawling practices.
+
+---
+
+## 5. Dispatch Results
+
+Each crawl result includes dispatch information:
+
+```python
+@dataclass
+class DispatchResult:
+    task_id: str
+    memory_usage: float
+    peak_memory: float
+    start_time: datetime
+    end_time: datetime
+    error_message: str = ""
+```
+
+Access via `result.dispatch_result`:
+
+```python
+for result in results:
+    if result.success:
+        dr = result.dispatch_result
+        print(f"URL: {result.url}")
+        print(f"Memory: {dr.memory_usage:.1f}MB")
+        print(f"Duration: {dr.end_time - dr.start_time}")
+```
+
+## 6. Summary
+
+1. **Two Dispatcher Types**:
+
+   - MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
+   - SemaphoreDispatcher: Fixed concurrency limit
+
+2. **Optional Components**:
+
+   - RateLimiter: Smart request pacing and backoff
+   - CrawlerMonitor: Real-time progress visualization
+
+3. **Key Benefits**:
+
+   - Automatic memory management
+   - Built-in rate limiting
+   - Live progress monitoring
+   - Flexible concurrency control
+
+Choose the dispatcher that best fits your needs:
+
+- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
+- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios
+
+```
+
+
+## File: docs/md_v2/advanced/network-console-capture.md
+
+```md
+# Network Requests & Console Message Capturing
+
+Crawl4AI can capture all network requests and browser console messages during a crawl, which is invaluable for debugging, security analysis, or understanding page behavior.
+
+## Configuration
+
+To enable network and console capturing, use these configuration options:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+# Enable both network request capture and console message capture
+config = CrawlerRunConfig(
+    capture_network_requests=True,  # Capture all network requests and responses
+    capture_console_messages=True   # Capture all browser console output
+)
+```
+
+## Example Usage
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Enable both network request capture and console message capture
+    config = CrawlerRunConfig(
+        capture_network_requests=True,
+        capture_console_messages=True
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=config
+        )
+        
+        if result.success:
+            # Analyze network requests
+            if result.network_requests:
+                print(f"Captured {len(result.network_requests)} network events")
+                
+                # Count request types
+                request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
+                response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
+                failed_count = len([r for r in result.network_requests if r.get("event_type") == "request_failed"])
+                
+                print(f"Requests: {request_count}, Responses: {response_count}, Failed: {failed_count}")
+                
+                # Find API calls
+                api_calls = [r for r in result.network_requests 
+                            if r.get("event_type") == "request" and "api" in r.get("url", "")]
+                if api_calls:
+                    print(f"Detected {len(api_calls)} API calls:")
+                    for call in api_calls[:3]:  # Show first 3
+                        print(f"  - {call.get('method')} {call.get('url')}")
+            
+            # Analyze console messages
+            if result.console_messages:
+                print(f"Captured {len(result.console_messages)} console messages")
+                
+                # Group by type
+                message_types = {}
+                for msg in result.console_messages:
+                    msg_type = msg.get("type", "unknown")
+                    message_types[msg_type] = message_types.get(msg_type, 0) + 1
+                
+                print("Message types:", message_types)
+                
+                # Show errors (often the most important)
+                errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
+                if errors:
+                    print(f"Found {len(errors)} console errors:")
+                    for err in errors[:2]:  # Show first 2
+                        print(f"  - {err.get('text', '')[:100]}")
+            
+            # Export all captured data to a file for detailed analysis
+            with open("network_capture.json", "w") as f:
+                json.dump({
+                    "url": result.url,
+                    "network_requests": result.network_requests or [],
+                    "console_messages": result.console_messages or []
+                }, f, indent=2)
+            
+            print("Exported detailed capture data to network_capture.json")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Captured Data Structure
+
+### Network Requests
+
+The `result.network_requests` contains a list of dictionaries, each representing a network event with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `event_type` | Type of event: `"request"`, `"response"`, or `"request_failed"` |
+| `url` | The URL of the request |
+| `timestamp` | Unix timestamp when the event was captured |
+
+#### Request Event Fields
+
+```json
+{
+  "event_type": "request",
+  "url": "https://example.com/api/data.json",
+  "method": "GET",
+  "headers": {"User-Agent": "...", "Accept": "..."},
+  "post_data": "key=value&otherkey=value",
+  "resource_type": "fetch",
+  "is_navigation_request": false,
+  "timestamp": 1633456789.123
+}
+```
+
+#### Response Event Fields
+
+```json
+{
+  "event_type": "response",
+  "url": "https://example.com/api/data.json",
+  "status": 200,
+  "status_text": "OK",
+  "headers": {"Content-Type": "application/json", "Cache-Control": "..."},
+  "from_service_worker": false,
+  "request_timing": {"requestTime": 1234.56, "receiveHeadersEnd": 1234.78},
+  "timestamp": 1633456789.456
+}
+```
+
+#### Failed Request Event Fields
+
+```json
+{
+  "event_type": "request_failed",
+  "url": "https://example.com/missing.png",
+  "method": "GET",
+  "resource_type": "image",
+  "failure_text": "net::ERR_ABORTED 404",
+  "timestamp": 1633456789.789
+}
+```
+
+### Console Messages
+
+The `result.console_messages` contains a list of dictionaries, each representing a console message with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `type` | Message type: `"log"`, `"error"`, `"warning"`, `"info"`, etc. |
+| `text` | The message text |
+| `timestamp` | Unix timestamp when the message was captured |
+
+#### Console Message Example
+
+```json
+{
+  "type": "error",
+  "text": "Uncaught TypeError: Cannot read property 'length' of undefined",
+  "location": "https://example.com/script.js:123:45",
+  "timestamp": 1633456790.123
+}
+```
+
+## Key Benefits
+
+- **Full Request Visibility**: Capture all network activity including:
+  - Requests (URLs, methods, headers, post data)
+  - Responses (status codes, headers, timing)
+  - Failed requests (with error messages)
+  
+- **Console Message Access**: View all JavaScript console output:
+  - Log messages
+  - Warnings
+  - Errors with stack traces
+  - Developer debugging information
+
+- **Debugging Power**: Identify issues such as:
+  - Failed API calls or resource loading
+  - JavaScript errors affecting page functionality
+  - CORS or other security issues
+  - Hidden API endpoints and data flows
+
+- **Security Analysis**: Detect:
+  - Unexpected third-party requests
+  - Data leakage in request payloads
+  - Suspicious script behavior
+
+- **Performance Insights**: Analyze:
+  - Request timing data
+  - Resource loading patterns
+  - Potential bottlenecks
+
+## Use Cases
+
+1. **API Discovery**: Identify hidden endpoints and data flows in single-page applications
+2. **Debugging**: Track down JavaScript errors affecting page functionality
+3. **Security Auditing**: Detect unwanted third-party requests or data leakage
+4. **Performance Analysis**: Identify slow-loading resources
+5. **Ad/Tracker Analysis**: Detect and catalog advertising or tracking calls
+
+This capability is especially valuable for complex sites with heavy JavaScript, single-page applications, or when you need to understand the exact communication happening between a browser and servers.
+```
+
+
+## File: docs/md_v2/advanced/proxy-security.md
+
+```md
+# Proxy 
+
+## Basic Proxy Setup
+
+Simple proxy configuration with `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+# Using proxy URL
+browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
+
+# Using SOCKS proxy
+browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Authenticated Proxy
+
+Use an authenticated proxy with `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+proxy_config = {
+    "server": "http://proxy.example.com:8080",
+    "username": "user",
+    "password": "pass"
+}
+
+browser_config = BrowserConfig(proxy_config=proxy_config)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Rotating Proxies
+
+Example using a proxy rotation service dynamically:
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def get_next_proxy():
+    # Your proxy rotation logic here
+    return {"server": "http://next.proxy.com:8080"}
+
+async def main():
+    browser_config = BrowserConfig()
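+    run_config = CrawlerRunConfig()
+    # URLs to crawl; each one is fetched through a different proxy
+    urls = ["https://example.com/page1", "https://example.com/page2"]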
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # For each URL, create a new run config with different proxy
+        for url in urls:
+            proxy = await get_next_proxy()
+            # Clone the config and update proxy - this creates a new browser context
+            current_config = run_config.clone(proxy_config=proxy)
+            result = await crawler.arun(url=url, config=current_config)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+
+
+```
+
+
+## File: docs/md_v2/advanced/session-management.md
+
+```md
+# Session Management
+
+Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for:
+
+- **Performing JavaScript actions before and after crawling.**
+- **Executing multiple sequential crawls faster** without needing to reopen tabs or allocate memory repeatedly.
+
+**Note:** This feature is designed for sequential workflows and is not suitable for parallel operations.
+
+---
+
+#### Basic Session Usage
+
+Use `BrowserConfig` and `CrawlerRunConfig` to maintain state with a `session_id`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+    session_id = "my_session"
+
+    # Define configurations
+    config1 = CrawlerRunConfig(
+        url="https://example.com/page1", session_id=session_id
+    )
+    config2 = CrawlerRunConfig(
+        url="https://example.com/page2", session_id=session_id
+    )
+
+    # First request
+    result1 = await crawler.arun(config=config1)
+
+    # Subsequent request using the same session
+    result2 = await crawler.arun(config=config2)
+
+    # Clean up when done
+    await crawler.crawler_strategy.kill_session(session_id)
+```
+
+---
+
+#### Dynamic Content with Sessions
+
+Here's an example of crawling GitHub commits across multiple pages while preserving session state:
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.cache_context import CacheMode
+
+async def crawl_dynamic_content():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "github_commits_session"
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        all_commits = []
+
+        # Define extraction schema
+        schema = {
+            "name": "Commit Extractor",
+            "baseSelector": "li.Box-sc-g0xbh4-0",
+            "fields": [{
+                "name": "title", "selector": "h4.markdown-title", "type": "text"
+            }],
+        }
+        extraction_strategy = JsonCssExtractionStrategy(schema)
+
+        # JavaScript and wait configurations
+        js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
+        wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
+
+        # Crawl multiple pages
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                extraction_strategy=extraction_strategy,
+                js_code=js_next_page if page > 0 else None,
+                wait_for=wait_for if page > 0 else None,
+                js_only=page > 0,
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            if result.success:
+                commits = json.loads(result.extracted_content)
+                all_commits.extend(commits)
+                print(f"Page {page + 1}: Found {len(commits)} commits")
+
+        # Clean up session
+        await crawler.crawler_strategy.kill_session(session_id)
+        return all_commits
+```
+
+---
+
+## Example 1: Basic Session-Based Crawling
+
+A simple example using session-based crawling:
+
+```python
+import asyncio
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
+
+async def basic_session_crawl():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "dynamic_content_session"
+        url = "https://example.com/dynamic-content"
+
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
+                css_selector=".content-item",
+                cache_mode=CacheMode.BYPASS
+            )
+            
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(basic_session_crawl())
+```
+
+This example shows:
+1. Reusing the same `session_id` across multiple requests.
+2. Executing JavaScript to load more content dynamically.
+3. Properly closing the session to free resources.
+
+---
+
+## Advanced Technique 1: Custom Execution Hooks
+
+> Warning: You might feel confused by the end of the next few examples 😅, so make sure you are comfortable with the order of the parts before you start this.
+
+Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
+
+```python
+async def advanced_session_crawl_with_hooks():
+    first_commit = ""
+
+    async def on_execution_started(page):
+        nonlocal first_commit
+        try:
+            while True:
+                await page.wait_for_selector("li.commit-item h4")
+                commit = await page.query_selector("li.commit-item h4")
+                commit = (await commit.evaluate("(element) => element.textContent")).strip()
+                if commit and commit != first_commit:
+                    first_commit = commit
+                    break
+                await asyncio.sleep(0.5)
+        except Exception as e:
+            print(f"Warning: New content didn't appear: {e}")
+
+    async with AsyncWebCrawler() as crawler:
+        session_id = "commit_session"
+        url = "https://github.com/example/repo/commits/main"
+        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+        js_next_page = """document.querySelector('a.pagination-next').click();"""
+
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code=js_next_page if page > 0 else None,
+                css_selector="li.commit-item",
+                js_only=page > 0,
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(advanced_session_crawl_with_hooks())
+```
+
+This technique ensures new content loads before the next action.
+
+---
+
+## Advanced Technique 2: Integrated JavaScript Execution and Waiting
+
+Combine JavaScript execution and waiting logic for concise handling of dynamic content:
+
+```python
+async def integrated_js_and_wait_crawl():
+    async with AsyncWebCrawler() as crawler:
+        session_id = "integrated_session"
+        url = "https://github.com/example/repo/commits/main"
+
+        js_next_page_and_wait = """
+        (async () => {
+            const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
+            const initialCommit = getCurrentCommit();
+            document.querySelector('a.pagination-next').click();
+            while (getCurrentCommit() === initialCommit) {
+                await new Promise(resolve => setTimeout(resolve, 100));
+            }
+        })();
+        """
+
+        for page in range(3):
+            config = CrawlerRunConfig(
+                url=url,
+                session_id=session_id,
+                js_code=js_next_page_and_wait if page > 0 else None,
+                css_selector="li.commit-item",
+                js_only=page > 0,
+                cache_mode=CacheMode.BYPASS
+            )
+
+            result = await crawler.arun(config=config)
+            print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+        await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(integrated_js_and_wait_crawl())
+```
+
+---
+
+#### Common Use Cases for Sessions
+
+1. **Authentication Flows**: Log in and then interact with secured pages.
+
+2. **Pagination Handling**: Navigate through multiple pages.
+
+3. **Form Submissions**: Fill forms, submit, and process results.
+
+4. **Multi-step Processes**: Complete workflows that span multiple actions.
+
+5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
+
+```
+
+
+## File: docs/md_v2/advanced/ssl-certificate.md
+
+```md
+# `SSLCertificate` Reference
+
+The **`SSLCertificate`** class encapsulates an SSL certificate’s data and allows exporting it in various formats (PEM, DER, JSON, or text). It’s used within **Crawl4AI** whenever you set **`fetch_ssl_certificate=True`** in your **`CrawlerRunConfig`**.  
+
+## 1. Overview
+
+**Location**: `crawl4ai/ssl_certificate.py`
+
+```python
+class SSLCertificate:
+    """
+    Represents an SSL certificate with methods to export in various formats.
+
+    Main Methods:
+    - from_url(url, timeout=10)
+    - from_file(file_path)
+    - from_binary(binary_data)
+    - to_json(filepath=None)
+    - to_pem(filepath=None)
+    - to_der(filepath=None)
+    ...
+
+    Common Properties:
+    - issuer
+    - subject
+    - valid_from
+    - valid_until
+    - fingerprint
+    """
+```
+
+### Typical Use Case
+1. You **enable** certificate fetching in your crawl by:
+   ```python
+   CrawlerRunConfig(fetch_ssl_certificate=True, ...)
+   ```
+2. After `arun()`, if `result.ssl_certificate` is present, it’s an instance of **`SSLCertificate`**.  
+3. You can **read** basic properties (issuer, subject, validity) or **export** them in multiple formats.
+
+---
+
+## 2. Construction & Fetching
+
+### 2.1 **`from_url(url, timeout=10)`**
+Manually load an SSL certificate from a given URL (port 443). Typically used internally, but you can call it directly if you want:
+
+```python
+cert = SSLCertificate.from_url("https://example.com")
+if cert:
+    print("Fingerprint:", cert.fingerprint)
+```
+
+### 2.2 **`from_file(file_path)`**
+Load a certificate from a file containing DER-encoded (binary ASN.1) data. Rarely needed unless you have local certificate files:
+
+```python
+cert = SSLCertificate.from_file("/path/to/cert.der")
+```
+
+### 2.3 **`from_binary(binary_data)`**
+Initialize from raw binary. E.g., if you captured it from a socket or another source:
+
+```python
+cert = SSLCertificate.from_binary(raw_bytes)
+```
+
+---
+
+## 3. Common Properties
+
+After obtaining an **`SSLCertificate`** instance (e.g. `result.ssl_certificate` from a crawl), you can read:
+
+1. **`issuer`** *(dict)*  
+   - E.g. `{"CN": "My Root CA", "O": "..."}`
+2. **`subject`** *(dict)*  
+   - E.g. `{"CN": "example.com", "O": "ExampleOrg"}`
+3. **`valid_from`** *(str)*  
+   - NotBefore date/time. Often in ASN.1/UTC format.
+4. **`valid_until`** *(str)*  
+   - NotAfter date/time.
+5. **`fingerprint`** *(str)*  
+   - The SHA-256 digest (lowercase hex).  
+   - E.g. `"d14d2e..."`
+
+---
+
+## 4. Export Methods
+
+Once you have an **`SSLCertificate`** object, you can **export** or **inspect** it:
+
+### 4.1 **`to_json(filepath=None)` → `Optional[str]`**
+- Returns a JSON string containing the parsed certificate fields.  
+- If `filepath` is provided, saves it to disk instead, returning `None`.
+
+**Usage**:
+```python
+json_data = cert.to_json()  # returns JSON string
+cert.to_json("certificate.json")  # writes file, returns None
+```
+
+### 4.2 **`to_pem(filepath=None)` → `Optional[str]`**
+- Returns a PEM-encoded string (common for web servers).  
+- If `filepath` is provided, saves it to disk instead.
+
+```python
+pem_str = cert.to_pem()              # in-memory PEM string
+cert.to_pem("/path/to/cert.pem")     # saved to file
+```
+
+### 4.3 **`to_der(filepath=None)` → `Optional[bytes]`**
+- Returns the original DER (binary ASN.1) bytes.  
+- If `filepath` is specified, writes the bytes there instead.
+
+```python
+der_bytes = cert.to_der()
+cert.to_der("certificate.der")
+```
+
+### 4.4 (Optional) **`export_as_text()`**
+- If you see a method like `export_as_text()`, it typically returns an OpenSSL-style textual representation.  
+- Not always needed, but can help for debugging or manual inspection.
+
+---
+
+## 5. Example Usage in Crawl4AI
+
+Below is a minimal example showing how the crawler obtains an SSL certificate from a site, then reads its basic properties and exports it:
+
+```python
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+    tmp_dir = "tmp"
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+            # 1. Basic Info
+            print("Issuer CN:", cert.issuer.get("CN", ""))
+            print("Valid until:", cert.valid_until)
+            print("Fingerprint:", cert.fingerprint)
+            
+            # 2. Export
+            cert.to_json(os.path.join(tmp_dir, "certificate.json"))
+            cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))
+            cert.to_der(os.path.join(tmp_dir, "certificate.der"))
+    
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 6. Notes & Best Practices
+
+1. **Timeout**: `SSLCertificate.from_url` uses a default **10s** timeout for the socket connection, which it then wraps with SSL.  
+2. **Binary Form**: The certificate is loaded in ASN.1 (DER) form, then re-parsed by `OpenSSL.crypto`.  
+3. **Validation**: This does **not** validate the certificate chain or trust store. It only fetches and parses.  
+4. **Integration**: Within Crawl4AI, you typically just set `fetch_ssl_certificate=True` in `CrawlerRunConfig`; the final result’s `ssl_certificate` is automatically built.  
+5. **Export**: If you need to store or analyze a certificate, the JSON and PEM formats produced by `to_json` and `to_pem` are the most widely supported.
+
+---
+
+### Summary
+
+- **`SSLCertificate`** is a convenience class for capturing and exporting the **TLS certificate** from your crawled site(s).  
+- Common usage is in the **`CrawlResult.ssl_certificate`** field, accessible after setting `fetch_ssl_certificate=True`.  
+- Offers quick access to essential certificate details (`issuer`, `subject`, `fingerprint`) and is easy to export (PEM, DER, JSON) for further analysis or server usage.
+
+Use it whenever you need **insight** into a site’s certificate or require some form of cryptographic or compliance check.
+```
+
+
+## File: docs/md_v2/extraction/chunking.md
+
+```md
+# Chunking Strategies
+Chunking strategies are critical for dividing large texts into manageable parts, enabling effective content processing and extraction. These strategies are foundational in cosine similarity-based extraction techniques, which allow users to retrieve only the most relevant chunks of content for a given query. Additionally, they facilitate direct integration into RAG (Retrieval-Augmented Generation) systems for structured and scalable workflows.
+
+### Why Use Chunking?
+1. **Cosine Similarity and Query Relevance**: Prepares chunks for semantic similarity analysis.
+2. **RAG System Integration**: Seamlessly processes and stores chunks for retrieval.
+3. **Structured Processing**: Allows for diverse segmentation methods, such as sentence-based, topic-based, or windowed approaches.
+
+### Methods of Chunking
+
+#### 1. Regex-Based Chunking
+Splits text based on regular expression patterns, useful for coarse segmentation.
+
+**Code Example**:
+```python
+import re
+
+class RegexChunking:
+    def __init__(self, patterns=None):
+        self.patterns = patterns or [r'\n\n']  # Default pattern for paragraphs
+
+    def chunk(self, text):
+        paragraphs = [text]
+        for pattern in self.patterns:
+            paragraphs = [seg for p in paragraphs for seg in re.split(pattern, p)]
+        return paragraphs
+
+# Example Usage
+text = """This is the first paragraph.
+
+This is the second paragraph."""
+chunker = RegexChunking()
+print(chunker.chunk(text))
+```
+
+#### 2. Sentence-Based Chunking
+Divides text into sentences using NLP tools, ideal for extracting meaningful statements.
+
+**Code Example**:
+```python
+from nltk.tokenize import sent_tokenize
+
+class NlpSentenceChunking:
+    def chunk(self, text):
+        sentences = sent_tokenize(text)
+        return [sentence.strip() for sentence in sentences]
+
+# Example Usage
+text = "This is sentence one. This is sentence two."
+chunker = NlpSentenceChunking()
+print(chunker.chunk(text))
+```
+
+#### 3. Topic-Based Segmentation
+Uses algorithms like TextTiling to create topic-coherent chunks.
+
+**Code Example**:
+```python
+from nltk.tokenize import TextTilingTokenizer
+
+class TopicSegmentationChunking:
+    def __init__(self):
+        self.tokenizer = TextTilingTokenizer()
+
+    def chunk(self, text):
+        return self.tokenizer.tokenize(text)
+
+# Example Usage
+text = """This is an introduction.
+This is a detailed discussion on the topic."""
+chunker = TopicSegmentationChunking()
+print(chunker.chunk(text))
+```
+
+#### 4. Fixed-Length Word Chunking
+Segments text into chunks of a fixed word count.
+
+**Code Example**:
+```python
+class FixedLengthWordChunking:
+    def __init__(self, chunk_size=100):
+        self.chunk_size = chunk_size
+
+    def chunk(self, text):
+        words = text.split()
+        return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
+
+# Example Usage
+text = "This is a long text with many words to be chunked into fixed sizes."
+chunker = FixedLengthWordChunking(chunk_size=5)
+print(chunker.chunk(text))
+```
+
+#### 5. Sliding Window Chunking
+Generates overlapping chunks for better contextual coherence.
+
+**Code Example**:
+```python
+class SlidingWindowChunking:
+    def __init__(self, window_size=100, step=50):
+        self.window_size = window_size
+        self.step = step
+
+    def chunk(self, text):
+        words = text.split()
+        chunks = []
+        for i in range(0, len(words) - self.window_size + 1, self.step):
+            chunks.append(' '.join(words[i:i + self.window_size]))
+        return chunks
+
+# Example Usage
+text = "This is a long text to demonstrate sliding window chunking."
+chunker = SlidingWindowChunking(window_size=5, step=2)
+print(chunker.chunk(text))
+```
+
+### Combining Chunking with Cosine Similarity
+To enhance the relevance of extracted content, chunking strategies can be paired with cosine similarity techniques. Here’s an example workflow:
+
+**Code Example**:
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+class CosineSimilarityExtractor:
+    def __init__(self, query):
+        self.query = query
+        self.vectorizer = TfidfVectorizer()
+
+    def find_relevant_chunks(self, chunks):
+        vectors = self.vectorizer.fit_transform([self.query] + chunks)
+        similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
+        return [(chunks[i], similarities[i]) for i in range(len(chunks))]
+
+# Example Workflow
+text = """This is a sample document. It has multiple sentences. 
+We are testing chunking and similarity."""
+
+chunker = SlidingWindowChunking(window_size=5, step=3)
+chunks = chunker.chunk(text)
+query = "testing chunking"
+extractor = CosineSimilarityExtractor(query)
+relevant_chunks = extractor.find_relevant_chunks(chunks)
+
+print(relevant_chunks)
+```
+
+```
+
+
+## File: docs/md_v2/extraction/clustring-strategies.md
+
+```md
+# Cosine Strategy
+
+The Cosine Strategy in Crawl4AI uses similarity-based clustering to identify and extract relevant content sections from web pages. This strategy is particularly useful when you need to find and extract content based on semantic similarity rather than structural patterns.
+
+## How It Works
+
+The Cosine Strategy:
+1. Breaks down page content into meaningful chunks
+2. Converts text into vector representations
+3. Calculates similarity between chunks
+4. Clusters similar content together
+5. Ranks and filters content based on relevance
+
+## Basic Usage
+
+```python
+from crawl4ai.extraction_strategy import CosineStrategy
+
+strategy = CosineStrategy(
+    semantic_filter="product reviews",    # Target content type
+    word_count_threshold=10,             # Minimum words per cluster
+    sim_threshold=0.3                    # Similarity threshold
+)
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun(
+        url="https://example.com/reviews",
+        extraction_strategy=strategy
+    )
+    
+    content = result.extracted_content
+```
+
+## Configuration Options
+
+### Core Parameters
+
+```python
+CosineStrategy(
+    # Content Filtering
+    semantic_filter: str = None,       # Keywords/topic for content filtering
+    word_count_threshold: int = 10,    # Minimum words per cluster
+    sim_threshold: float = 0.3,        # Similarity threshold (0.0 to 1.0)
+    
+    # Clustering Parameters
+    max_dist: float = 0.2,            # Maximum distance for clustering
+    linkage_method: str = 'ward',      # Clustering linkage method
+    top_k: int = 3,                   # Number of top categories to extract
+    
+    # Model Configuration
+    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',  # Embedding model
+    
+    verbose: bool = False             # Enable logging
+)
+```
+
+### Parameter Details
+
+1. **semantic_filter**
+   - Sets the target topic or content type
+   - Use keywords relevant to your desired content
+   - Example: "technical specifications", "user reviews", "pricing information"
+
+2. **sim_threshold**
+   - Controls how similar content must be to be grouped together
+   - Higher values (e.g., 0.8) mean stricter matching
+   - Lower values (e.g., 0.3) allow more variation
+   ```python
+   # Strict matching
+   strategy = CosineStrategy(sim_threshold=0.8)
+   
+   # Loose matching
+   strategy = CosineStrategy(sim_threshold=0.3)
+   ```
+
+3. **word_count_threshold**
+   - Filters out short content blocks
+   - Helps eliminate noise and irrelevant content
+   ```python
+   # Only consider substantial paragraphs
+   strategy = CosineStrategy(word_count_threshold=50)
+   ```
+
+4. **top_k**
+   - Number of top content clusters to return
+   - Higher values return more diverse content
+   ```python
+   # Get top 5 most relevant content clusters
+   strategy = CosineStrategy(top_k=5)
+   ```
+
+## Use Cases
+
+### 1. Article Content Extraction
+```python
+strategy = CosineStrategy(
+    semantic_filter="main article content",
+    word_count_threshold=100,  # Longer blocks for articles
+    top_k=1                   # Usually want single main content
+)
+
+result = await crawler.arun(
+    url="https://example.com/blog/post",
+    extraction_strategy=strategy
+)
+```
+
+### 2. Product Review Analysis
+```python
+strategy = CosineStrategy(
+    semantic_filter="customer reviews and ratings",
+    word_count_threshold=20,   # Reviews can be shorter
+    top_k=10,                 # Get multiple reviews
+    sim_threshold=0.4         # Allow variety in review content
+)
+```
+
+### 3. Technical Documentation
+```python
+strategy = CosineStrategy(
+    semantic_filter="technical specifications documentation",
+    word_count_threshold=30,
+    sim_threshold=0.6,        # Stricter matching for technical content
+    max_dist=0.3             # Allow related technical sections
+)
+```
+
+## Advanced Features
+
+### Custom Clustering
+```python
+strategy = CosineStrategy(
+    linkage_method='complete',  # Alternative clustering method
+    max_dist=0.4,              # Larger clusters
+    model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'  # Multilingual support
+)
+```
+
+### Content Filtering Pipeline
+```python
+strategy = CosineStrategy(
+    semantic_filter="pricing plans features",
+    word_count_threshold=15,
+    sim_threshold=0.5,
+    top_k=3
+)
+
+async def extract_pricing_features(url: str):
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url=url,
+            extraction_strategy=strategy
+        )
+        
+        if result.success:
+            content = json.loads(result.extracted_content)
+            return {
+                'pricing_features': content,
+                'clusters': len(content),
+                'similarity_scores': [item['score'] for item in content]
+            }
+```
+
+## Best Practices
+
+1. **Adjust Thresholds Iteratively**
+   - Start with default values
+   - Adjust based on results
+   - Monitor clustering quality
+
+2. **Choose Appropriate Word Count Thresholds**
+   - Higher for articles (100+)
+   - Lower for reviews/comments (20+)
+   - Medium for product descriptions (50+)
+
+3. **Optimize Performance**
+   ```python
+   strategy = CosineStrategy(
+       word_count_threshold=10,  # Filter early
+       top_k=5,                 # Limit results
+       verbose=True             # Monitor performance
+   )
+   ```
+
+4. **Handle Different Content Types**
+   ```python
+   # For mixed content pages
+   strategy = CosineStrategy(
+       semantic_filter="product features",
+       sim_threshold=0.4,      # More flexible matching
+       max_dist=0.3,          # Larger clusters
+       top_k=3                # Multiple relevant sections
+   )
+   ```
+
+## Error Handling
+
+```python
+try:
+    result = await crawler.arun(
+        url="https://example.com",
+        extraction_strategy=strategy
+    )
+    
+    if result.success:
+        content = json.loads(result.extracted_content)
+        if not content:
+            print("No relevant content found")
+    else:
+        print(f"Extraction failed: {result.error_message}")
+        
+except Exception as e:
+    print(f"Error during extraction: {str(e)}")
+```
+
+The Cosine Strategy is particularly effective when:
+- Content structure is inconsistent
+- You need semantic understanding
+- You want to find similar content blocks
+- Structure-based extraction (CSS/XPath) isn't reliable
+
+It works well with other strategies and can be used as a pre-processing step for LLM-based extraction.
+```
+
+
+## File: docs/md_v2/extraction/llm-strategies.md
+
+```md
+# Extracting JSON (LLM)
+
+In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
+
+1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).  
+2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.  
+3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
+
+**Important**: LLM-based extraction can be slower and costlier than schema-based approaches. If your page data is highly structured, consider using [`JsonCssExtractionStrategy`](./no-llm-strategies.md) or [`JsonXPathExtractionStrategy`](./no-llm-strategies.md) first. But if you need AI to interpret or reorganize content, read on!
+
+---
+
+## 1. Why Use an LLM?
+
+- **Complex Reasoning**: If the site’s data is unstructured, scattered, or full of natural language context.  
+- **Semantic Extraction**: Summaries, knowledge graphs, or relational data that require comprehension.  
+- **Flexible**: You can pass instructions to the model to do more advanced transformations or classification.
+
+---
+
+## 2. Provider-Agnostic via LiteLLM
+
+Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
+
+- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).  
+- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.  
+- **`api_base`** (optional): If your provider has a custom endpoint.  
+
+This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
+
+---
+
+## 3. How LLM Extraction Works
+
+### 3.1 Flow
+
+1. **Chunking** (optional): The HTML or markdown is split into smaller segments if it’s very long (based on `chunk_token_threshold`, overlap, etc.).  
+2. **Prompt Construction**: For each chunk, the library forms a prompt that includes your **`instruction`** (and possibly schema or examples).  
+3. **LLM Inference**: Each chunk is sent to the model in parallel or sequentially (depending on your concurrency).  
+4. **Combining**: The results from each chunk are merged and parsed into JSON.
+
+### 3.2 `extraction_type`
+
+- **`"schema"`**: The model tries to return JSON conforming to your Pydantic-based schema.  
+- **`"block"`**: The model returns freeform text, or smaller JSON structures, which the library collects.  
+
+For structured data, `"schema"` is recommended. You provide `schema=YourPydanticModel.model_json_schema()`.
+
+---
+
+## 4. Key Parameters
+
+Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
+
+1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.  
+2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.  
+3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
+4. **`extraction_type`** (str): `"schema"` or `"block"`.  
+5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
+6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.  
+7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.  
+8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.  
+9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:  
+   - `"markdown"`: The raw markdown (default).  
+   - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.  
+   - `"html"`: The cleaned or raw HTML.  
+10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.  
+11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).  
+
+**Example**:
+
+```python
+extraction_strategy = LLMExtractionStrategy(
+    llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
+    schema=MyModel.model_json_schema(),
+    extraction_type="schema",
+    instruction="Extract a list of items from the text with 'name' and 'price' fields.",
+    chunk_token_threshold=1200,
+    overlap_rate=0.1,
+    apply_chunking=True,
+    input_format="html",
+    extra_args={"temperature": 0.1, "max_tokens": 1000},
+    verbose=True
+)
+```
+
+---
+
+## 5. Putting It in `CrawlerRunConfig`
+
+**Important**: In Crawl4AI, all strategy definitions should go inside the `CrawlerRunConfig`, not directly as a param in `arun()`. Here’s a full example:
+
+```python
+import os
+import asyncio
+import json
+from pydantic import BaseModel, Field
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class Product(BaseModel):
+    name: str
+    price: str
+
+async def main():
+    # 1. Define the LLM extraction strategy
+    llm_strategy = LLMExtractionStrategy(
+        llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
+        schema=Product.schema_json(), # Or use model_json_schema()
+        extraction_type="schema",
+        instruction="Extract all product objects with 'name' and 'price' from the content.",
+        chunk_token_threshold=1000,
+        overlap_rate=0.0,
+        apply_chunking=True,
+        input_format="markdown",   # or "html", "fit_markdown"
+        extra_args={"temperature": 0.0, "max_tokens": 800}
+    )
+
+    # 2. Build the crawler config
+    crawl_config = CrawlerRunConfig(
+        extraction_strategy=llm_strategy,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    # 3. Create a browser config if needed
+    browser_cfg = BrowserConfig(headless=True)
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        # 4. Let's say we want to crawl a single page
+        result = await crawler.arun(
+            url="https://example.com/products",
+            config=crawl_config
+        )
+
+        if result.success:
+            # 5. The extracted content is presumably JSON
+            data = json.loads(result.extracted_content)
+            print("Extracted items:", data)
+            
+            # 6. Show usage stats
+            llm_strategy.show_usage()  # prints token usage
+        else:
+            print("Error:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+## 6. Chunking Details
+
+### 6.1 `chunk_token_threshold`
+
+If your page is large, you might exceed your LLM’s context window. **`chunk_token_threshold`** sets the approximate max tokens per chunk. The library calculates word→token ratio using `word_token_rate` (often ~0.75 by default). If chunking is enabled (`apply_chunking=True`), the text is split into segments.
+
+### 6.2 `overlap_rate`
+
+To keep context continuous across chunks, we can overlap them. E.g., `overlap_rate=0.1` means each subsequent chunk includes 10% of the previous chunk’s text. This is helpful if your needed info might straddle chunk boundaries.
+
+### 6.3 Performance & Parallelism
+
+By chunking, you can potentially process multiple chunks in parallel (depending on your concurrency settings and the LLM provider). This reduces total time if the site is huge or has many sections.
+
+---
+
+## 7. Input Format
+
+By default, **LLMExtractionStrategy** uses `input_format="markdown"`, meaning the **crawler’s final markdown** is fed to the LLM. You can change to:
+
+- **`html`**: The cleaned HTML or raw HTML (depending on your crawler config) goes into the LLM.  
+- **`fit_markdown`**: If you used, for instance, `PruningContentFilter`, the “fit” version of the markdown is used. This can drastically reduce tokens if you trust the filter.  
+- **`markdown`**: Standard markdown output from the crawler’s `markdown_generator`.
+
+This setting is crucial: if the LLM instructions rely on HTML tags, pick `"html"`. If you prefer a text-based approach, pick `"markdown"`.
+
+```python
+LLMExtractionStrategy(
+    # ...
+    input_format="html",  # Instead of "markdown" or "fit_markdown"
+)
+```
+
+---
+
+## 8. Token Usage & Show Usage
+
+To keep track of tokens and cost, each chunk is processed with an LLM call. We record usage in:
+
+- **`usages`** (list): token usage per chunk or call.  
+- **`total_usage`**: sum of all chunk calls.  
+- **`show_usage()`**: prints a usage report (if the provider returns usage data).
+
+```python
+llm_strategy = LLMExtractionStrategy(...)
+# ...
+llm_strategy.show_usage()
+# e.g. “Total usage: 1241 tokens across 2 chunk calls”
+```
+
+If your model provider doesn’t return usage info, these fields might be partial or empty.
+
+---
+
+## 9. Example: Building a Knowledge Graph
+
+Below is a snippet combining **`LLMExtractionStrategy`** with a Pydantic schema for a knowledge graph. Notice how we pass an **`instruction`** telling the model what to parse.
+
+```python
+import os
+import json
+import asyncio
+from typing import List
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class Entity(BaseModel):
+    name: str
+    description: str
+
+class Relationship(BaseModel):
+    entity1: Entity
+    entity2: Entity
+    description: str
+    relation_type: str
+
+class KnowledgeGraph(BaseModel):
+    entities: List[Entity]
+    relationships: List[Relationship]
+
+async def main():
+    # LLM extraction strategy
+    llm_strat = LLMExtractionStrategy(
+        llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
+        schema=KnowledgeGraph.schema_json(),
+        extraction_type="schema",
+        instruction="Extract entities and relationships from the content. Return valid JSON.",
+        chunk_token_threshold=1400,
+        apply_chunking=True,
+        input_format="html",
+        extra_args={"temperature": 0.1, "max_tokens": 1500}
+    )
+
+    crawl_config = CrawlerRunConfig(
+        extraction_strategy=llm_strat,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+        # Example page
+        url = "https://www.nbcnews.com/business"
+        result = await crawler.arun(url=url, config=crawl_config)
+
+        if result.success:
+            with open("kb_result.json", "w", encoding="utf-8") as f:
+                f.write(result.extracted_content)
+            llm_strat.show_usage()
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Key Observations**:
+
+- **`extraction_type="schema"`** ensures we get JSON fitting our `KnowledgeGraph`.  
+- **`input_format="html"`** means we feed HTML to the model.  
+- **`instruction`** guides the model to output a structured knowledge graph.  
+
+---
+
+## 10. Best Practices & Caveats
+
+1. **Cost & Latency**: LLM calls can be slow or expensive. Consider chunking or smaller coverage if you only need partial data.  
+2. **Model Token Limits**: If your page + instruction exceed the context window, chunking is essential.  
+3. **Instruction Engineering**: Well-crafted instructions can drastically improve output reliability.  
+4. **Schema Strictness**: `"schema"` extraction tries to parse the model output as JSON. If the model returns invalid JSON, partial extraction might happen, or you might get an error.  
+5. **Parallel vs. Serial**: The library can process multiple chunks in parallel, but you must watch out for rate limits on certain providers.  
+6. **Check Output**: Sometimes, an LLM might omit fields or produce extraneous text. You may want to post-validate with Pydantic or do additional cleanup.
+
+---
+
+## 11. Conclusion
+
+**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
+
+- Put your LLM strategy **in `CrawlerRunConfig`**.  
+- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.  
+- Tweak **`chunk_token_threshold`**, **`overlap_rate`**, and **`apply_chunking`** to handle large content efficiently.  
+- Monitor token usage with `show_usage()`.
+
+If your site’s data is consistent or repetitive, consider [`JsonCssExtractionStrategy`](./no-llm-strategies.md) first for speed and simplicity. But if you need an **AI-driven** approach, `LLMExtractionStrategy` offers a flexible, multi-provider solution for extracting structured JSON from any website.
+
+**Next Steps**:
+
+1. **Experiment with Different Providers**  
+   - Try switching the `provider` (e.g., `"ollama/llama2"`, `"openai/gpt-4o"`, etc.) to see differences in speed, accuracy, or cost.  
+   - Pass different `extra_args` like `temperature`, `top_p`, and `max_tokens` to fine-tune your results.
+
+2. **Performance Tuning**  
+   - If pages are large, tweak `chunk_token_threshold`, `overlap_rate`, or `apply_chunking` to optimize throughput.  
+   - Check the usage logs with `show_usage()` to keep an eye on token consumption and identify potential bottlenecks.
+
+3. **Validate Outputs**  
+   - If using `extraction_type="schema"`, parse the LLM’s JSON with a Pydantic model for a final validation step.  
+   - Log or handle any parse errors gracefully, especially if the model occasionally returns malformed JSON.
+
+4. **Explore Hooks & Automation**  
+   - Integrate LLM extraction with [hooks](../advanced/hooks-auth.md) for complex pre/post-processing.  
+   - Use a multi-step pipeline: crawl, filter, LLM-extract, then store or index results for further analysis.
+
+**Last Updated**: 2025-01-01
+
+---
+
+That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
+```
+
+
+## File: docs/md_v2/extraction/no-llm-strategies.md
+
+```md
+# Extracting JSON (No LLM)
+
+One of Crawl4AI’s **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
+
+**Why avoid LLM for basic extractions?**
+
+1. **Faster & Cheaper**: No API calls or GPU overhead.  
+2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free.  
+3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate.  
+4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel.
+
+Below, we’ll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We’ll also highlight advanced features like **nested fields** and **base element attributes**.
+
+---
+
+## 1. Intro to Schema-Based Extraction
+
+A schema defines:
+
+1. A **base selector** that identifies each “container” element on the page (e.g., a product row, a blog post card).  
+2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).  
+3. **Nested** or **list** types for repeated or hierarchical structures.  
+
+For example, if you have a list of products, each one might have a name, price, reviews, and “related products.” This approach is faster and more reliable than an LLM for consistent, structured pages.
+
+---
+
+## 2. Simple Example: Crypto Prices
+
+Let’s begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don’t** call any LLM:
+
+```python
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+async def extract_crypto_prices():
+    # 1. Define a simple extraction schema
+    schema = {
+        "name": "Crypto Prices",
+        "baseSelector": "div.crypto-row",    # Repeated elements
+        "fields": [
+            {
+                "name": "coin_name",
+                "selector": "h2.coin-name",
+                "type": "text"
+            },
+            {
+                "name": "price",
+                "selector": "span.coin-price",
+                "type": "text"
+            }
+        ]
+    }
+
+    # 2. Create the extraction strategy
+    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+    # 3. Set up your crawler config (if needed)
+    config = CrawlerRunConfig(
+        # e.g., pass js_code or wait_for if the page is dynamic
+        # wait_for="css:.crypto-row:nth-child(20)"
+        cache_mode = CacheMode.BYPASS,
+        extraction_strategy=extraction_strategy,
+    )
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # 4. Run the crawl and extraction
+        result = await crawler.arun(
+            url="https://example.com/crypto-prices",
+            
+            config=config
+        )
+
+        if not result.success:
+            print("Crawl failed:", result.error_message)
+            return
+
+        # 5. Parse the extracted JSON
+        data = json.loads(result.extracted_content)
+        print(f"Extracted {len(data)} coin entries")
+        print(json.dumps(data[0], indent=2) if data else "No data found")
+
+asyncio.run(extract_crypto_prices())
+```
+
+**Highlights**:
+
+- **`baseSelector`**: Tells us where each “item” (crypto row) is.  
+- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.  
+- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
+
+No LLM is needed, and the performance is **near-instant** for hundreds or thousands of items.
+
+---
+
+### **XPath Example with `raw://` HTML**
+
+Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We’ll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
+
+```python
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
+
+async def extract_crypto_prices_xpath():
+    # 1. Minimal dummy HTML with some repeating rows
+    dummy_html = """
+    <html>
+      <body>
+        <div class='crypto-row'>
+          <h2 class='coin-name'>Bitcoin</h2>
+          <span class='coin-price'>$28,000</span>
+        </div>
+        <div class='crypto-row'>
+          <h2 class='coin-name'>Ethereum</h2>
+          <span class='coin-price'>$1,800</span>
+        </div>
+      </body>
+    </html>
+    """
+
+    # 2. Define the JSON schema (XPath version)
+    schema = {
+        "name": "Crypto Prices via XPath",
+        "baseSelector": "//div[@class='crypto-row']",
+        "fields": [
+            {
+                "name": "coin_name",
+                "selector": ".//h2[@class='coin-name']",
+                "type": "text"
+            },
+            {
+                "name": "price",
+                "selector": ".//span[@class='coin-price']",
+                "type": "text"
+            }
+        ]
+    }
+
+    # 3. Place the strategy in the CrawlerRunConfig
+    config = CrawlerRunConfig(
+        extraction_strategy=JsonXPathExtractionStrategy(schema, verbose=True)
+    )
+
+    # 4. Use raw:// scheme to pass dummy_html directly
+    raw_url = f"raw://{dummy_html}"
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url=raw_url,
+            config=config
+        )
+
+        if not result.success:
+            print("Crawl failed:", result.error_message)
+            return
+
+        data = json.loads(result.extracted_content)
+        print(f"Extracted {len(data)} coin rows")
+        if data:
+            print("First item:", data[0])
+
+asyncio.run(extract_crypto_prices_xpath())
+```
+
+**Key Points**:
+
+1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.  
+2. **`baseSelector`** and each field’s `"selector"` use **XPath** instead of CSS.  
+3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.  
+4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**.  
+
+That’s how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
+
+---
+
+## 3. Advanced Schema & Nested Structures
+
+Real sites often have **nested** or repeated data—like categories containing products, which themselves have a list of reviews or features. For that, we can define **nested** or **list** (and even **nested_list**) fields.
+
+### Sample E-Commerce HTML
+
+We have a **sample e-commerce** HTML file on GitHub (example):
+```
+https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+```
+This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**.
+
+```python
+schema = {
+    "name": "E-commerce Product Catalog",
+    "baseSelector": "div.category",
+    # (1) We can define optional baseFields if we want to extract attributes 
+    # from the category container
+    "baseFields": [
+        {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"}, 
+    ],
+    "fields": [
+        {
+            "name": "category_name",
+            "selector": "h2.category-name",
+            "type": "text"
+        },
+        {
+            "name": "products",
+            "selector": "div.product",
+            "type": "nested_list",    # repeated sub-objects
+            "fields": [
+                {
+                    "name": "name",
+                    "selector": "h3.product-name",
+                    "type": "text"
+                },
+                {
+                    "name": "price",
+                    "selector": "p.product-price",
+                    "type": "text"
+                },
+                {
+                    "name": "details",
+                    "selector": "div.product-details",
+                    "type": "nested",  # single sub-object
+                    "fields": [
+                        {
+                            "name": "brand",
+                            "selector": "span.brand",
+                            "type": "text"
+                        },
+                        {
+                            "name": "model",
+                            "selector": "span.model",
+                            "type": "text"
+                        }
+                    ]
+                },
+                {
+                    "name": "features",
+                    "selector": "ul.product-features li",
+                    "type": "list",
+                    "fields": [
+                        {"name": "feature", "type": "text"} 
+                    ]
+                },
+                {
+                    "name": "reviews",
+                    "selector": "div.review",
+                    "type": "nested_list",
+                    "fields": [
+                        {
+                            "name": "reviewer", 
+                            "selector": "span.reviewer", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "rating", 
+                            "selector": "span.rating", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "comment", 
+                            "selector": "p.review-text", 
+                            "type": "text"
+                        }
+                    ]
+                },
+                {
+                    "name": "related_products",
+                    "selector": "ul.related-products li",
+                    "type": "list",
+                    "fields": [
+                        {
+                            "name": "name", 
+                            "selector": "span.related-name", 
+                            "type": "text"
+                        },
+                        {
+                            "name": "price", 
+                            "selector": "span.related-price", 
+                            "type": "text"
+                        }
+                    ]
+                }
+            ]
+        }
+    ]
+}
+```
+
+Key Takeaways:
+
+- **Nested vs. List**:  
+  - **`type: "nested"`** means a **single** sub-object (like `details`).  
+  - **`type: "list"`** means multiple items that are **simple** dictionaries or single text fields.  
+  - **`type: "nested_list"`** means repeated **complex** objects (like `products` or `reviews`).
+- **Base Fields**: We can extract **attributes** from the container element via `"baseFields"`. For instance, `"data_cat_id"` might be `data-cat-id="elect123"`.  
+- **Transforms**: We can also define a `transform` if we want to lower/upper case, strip whitespace, or even run a custom function.
+
+### Running the Extraction
+
+```python
+import json
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+ecommerce_schema = {
+    # ... the advanced schema from above ...
+}
+
+async def extract_ecommerce_data():
+    strategy = JsonCssExtractionStrategy(ecommerce_schema, verbose=True)
+    
+    config = CrawlerRunConfig(extraction_strategy=strategy)
+    
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+            config=config
+        )
+
+        if not result.success:
+            print("Crawl failed:", result.error_message)
+            return
+        
+        # Parse the JSON output
+        data = json.loads(result.extracted_content)
+        print(json.dumps(data, indent=2) if data else "No data found.")
+
+asyncio.run(extract_ecommerce_data())
+```
+
+If all goes well, you get a **structured** JSON array with each “category,” containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
+
+---
+
+## 4. Why “No LLM” Is Often Better
+
+1. **Zero Hallucination**: Schema-based extraction doesn’t guess text. It either finds it or not.  
+2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.  
+3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling.  
+4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model.
+
+**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns.
+
+---
+
+## 5. Base Element Attributes & Additional Fields
+
+It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
+
+```json
+{
+  "name": "href",
+  "type": "attribute",
+  "attribute": "href",
+  "default": null
+}
+```
+
+You can define them in **`baseFields`** (extracted from the main container element) or in each field’s sub-lists. This is especially helpful if you need an item’s link or ID stored in the parent `<div>`.
+
+---
+
+## 6. Putting It All Together: Larger Example
+
+Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author:
+
+```python
+schema = {
+  "name": "Blog Posts",
+  "baseSelector": "a.blog-post-card",
+  "baseFields": [
+    {"name": "post_url", "type": "attribute", "attribute": "href"}
+  ],
+  "fields": [
+    {"name": "title", "selector": "h2.post-title", "type": "text", "default": "No Title"},
+    {"name": "date", "selector": "time.post-date", "type": "text", "default": ""},
+    {"name": "summary", "selector": "p.post-summary", "type": "text", "default": ""},
+    {"name": "author", "selector": "span.post-author", "type": "text", "default": ""}
+  ]
+}
+```
+
+Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post objects, each with `"post_url"`, `"title"`, `"date"`, `"summary"`, `"author"`.
+
+---
+
+## 7. Tips & Best Practices
+
+1. **Inspect the DOM** in Chrome DevTools or Firefox’s Inspector to find stable selectors.  
+2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.  
+3. **Test** your schema on partial HTML or a test page before a big crawl.  
+4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.  
+5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it’ll often show warnings.  
+6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the “parent” item.  
+7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
+
+---
+
+## 8. Schema Generation Utility
+
+While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using an LLM. This is particularly useful when:
+
+1. You're dealing with a new website structure and want a quick starting point
+2. You need to extract complex nested data structures
+3. You want to avoid the learning curve of CSS/XPath selector syntax
+
+### Using the Schema Generator
+
+The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4o and an open-source model served through Ollama for schema generation:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai import LLMConfig
+
+# Sample HTML with product information
+html = """
+<div class="product-card">
+    <h2 class="title">Gaming Laptop</h2>
+    <div class="price">$999.99</div>
+    <div class="specs">
+        <ul>
+            <li>16GB RAM</li>
+            <li>1TB SSD</li>
+        </ul>
+    </div>
+</div>
+"""
+
+# Option 1: Using OpenAI (requires API token)
+css_schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    schema_type="css", 
+    llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
+)
+
+# Option 2: Using Ollama (open source, no token needed)
+xpath_schema = JsonXPathExtractionStrategy.generate_schema(
+    html,
+    schema_type="xpath",
+    llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
+)
+
+# Use the generated schema for fast, repeated extractions
+strategy = JsonCssExtractionStrategy(css_schema)
+```
+
+### LLM Provider Options
+
+1. **OpenAI GPT-4o (`openai/gpt-4o`)**
+   - Default provider
+   - Requires an API token
+   - Generally provides more accurate schemas
+   - Set via environment variable: `OPENAI_API_KEY`
+
+2. **Ollama (`ollama/llama3.3`)**
+   - Open source alternative
+   - No API token required
+   - Self-hosted option
+   - Good for development and testing
+
+### Benefits of Schema Generation
+
+1. **One-Time Cost**: While schema generation uses an LLM, it's a one-time cost. The generated schema can be reused for unlimited extractions without further LLM calls.
+2. **Smart Pattern Recognition**: The LLM analyzes the HTML structure and identifies common patterns, often producing more robust selectors than manual attempts.
+3. **Automatic Nesting**: Complex nested structures are automatically detected and properly represented in the schema.
+4. **Learning Tool**: The generated schemas serve as excellent examples for learning how to write your own schemas.
+
+### Best Practices
+
+1. **Review Generated Schemas**: While the generator is smart, always review and test the generated schema before using it in production.
+2. **Provide Representative HTML**: The better your sample HTML represents the overall structure, the more accurate the generated schema will be.
+3. **Consider Both CSS and XPath**: Try both schema types and choose the one that works best for your specific case.
+4. **Cache Generated Schemas**: Since generation uses an LLM, save successful schemas for reuse.
+5. **API Token Security**: Never hardcode API tokens. Use environment variables or secure configuration management.
+6. **Choose Provider Wisely**: 
+   - Use OpenAI for production-quality schemas
+   - Use Ollama for development, testing, or when you need a self-hosted solution
+
+That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
+
+---
+
+## 9. Conclusion
+
+With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that:
+
+- Scrape any consistent site for structured data.  
+- Support nested objects, repeating lists, or advanced transformations.  
+- Scale to thousands of pages quickly and reliably.
+
+**Next Steps**:
+
+- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed.  
+- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded.
+
+**Remember**: For repeated, structured data, you don’t need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
+
+**Last Updated**: 2025-01-01
+
+---
+
+That’s it for **Extracting JSON (No LLM)**! You’ve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
+```
+
diff --git a/deploy/docker/mcp_bridge.py b/deploy/docker/mcp_bridge.py
new file mode 100644
index 00000000..c55ed14c
--- /dev/null
+++ b/deploy/docker/mcp_bridge.py
@@ -0,0 +1,252 @@
+# deploy/docker/mcp_bridge.py
+
+from __future__ import annotations
+import inspect, json, re, anyio
+from contextlib import suppress
+from typing import Any, Callable, Dict, List, Tuple
+import httpx
+
+from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi import Request
+from sse_starlette.sse import EventSourceResponse
+from pydantic import BaseModel
+from mcp.server.sse import SseServerTransport
+
+import mcp.types as t
+from mcp.server.lowlevel.server import Server, NotificationOptions
+from mcp.server.models import InitializationOptions
+
+# ── opt‑in decorators ───────────────────────────────────────────
+def mcp_resource(name: str | None = None):
+    def deco(fn):
+        fn.__mcp_kind__, fn.__mcp_name__ = "resource", name
+        return fn
+    return deco
+
+def mcp_template(name: str | None = None):
+    def deco(fn):
+        fn.__mcp_kind__, fn.__mcp_name__ = "template", name
+        return fn
+    return deco
+
+def mcp_tool(name: str | None = None):
+    def deco(fn):
+        fn.__mcp_kind__, fn.__mcp_name__ = "tool", name
+        return fn
+    return deco
+
+# ── HTTP‑proxy helper for FastAPI endpoints ─────────────────────
+def _make_http_proxy(base_url: str, route):
+    method = list(route.methods - {"HEAD", "OPTIONS"})[0]
+    async def proxy(**kwargs):
+        # replace `/items/{id}` style params first
+        path = route.path
+        for k, v in list(kwargs.items()):
+            placeholder = "{" + k + "}"
+            if placeholder in path:
+                path = path.replace(placeholder, str(v))
+                kwargs.pop(k)
+        url = base_url.rstrip("/") + path
+
+        async with httpx.AsyncClient() as client:
+            try:
+                r = (
+                    await client.get(url, params=kwargs)
+                    if method == "GET"
+                    else await client.request(method, url, json=kwargs)
+                )
+                r.raise_for_status()
+                return r.text if method == "GET" else r.json()
+            except httpx.HTTPStatusError as e:
+                # surface FastAPI error details instead of plain 500
+                raise HTTPException(e.response.status_code, e.response.text)
+    return proxy
+
+# ── main entry point ────────────────────────────────────────────
+def attach_mcp(
+    app: FastAPI,
+    *,                          # keyword‑only
+    base: str = "/mcp",
+    name: str | None = None,
+    base_url: str,              # eg. "http://127.0.0.1:8020"
+) -> None:
+    """Call once after all routes are declared to expose WS+SSE MCP endpoints."""
+    server_name = name or app.title or "FastAPI-MCP"
+    mcp = Server(server_name)
+
+    # tools: Dict[str, Callable] = {}
+    tools: Dict[str, Tuple[Callable, Callable]] = {}
+    resources: Dict[str, Callable] = {}
+    templates: Dict[str, Callable] = {}
+
+    # register decorated FastAPI routes
+    for route in app.routes:
+        fn = getattr(route, "endpoint", None)
+        kind = getattr(fn, "__mcp_kind__", None)
+        if not kind:
+            continue
+
+        key = fn.__mcp_name__ or re.sub(r"[/{}}]", "_", route.path).strip("_")
+
+        # if kind == "tool":
+        #     tools[key] = _make_http_proxy(base_url, route)
+        if kind == "tool":
+            proxy = _make_http_proxy(base_url, route)
+            tools[key] = (proxy, fn)
+            continue
+        if kind == "resource":
+            resources[key] = fn
+        if kind == "template":
+            templates[key] = fn
+
+    # helpers for JSON‑Schema
+    def _schema(model: type[BaseModel] | None) -> dict:
+        return {"type": "object"} if model is None else model.model_json_schema()
+
+    def _body_model(fn: Callable) -> type[BaseModel] | None:
+        for p in inspect.signature(fn).parameters.values():
+            a = p.annotation
+            if inspect.isclass(a) and issubclass(a, BaseModel):
+                return a
+        return None
+
+    # MCP handlers
+    @mcp.list_tools()
+    async def _list_tools() -> List[t.Tool]:
+        out = []
+        for k, (proxy, orig_fn) in tools.items():
+            desc   = getattr(orig_fn, "__mcp_description__", None) or inspect.getdoc(orig_fn) or ""
+            schema = getattr(orig_fn, "__mcp_schema__", None) or _schema(_body_model(orig_fn))
+            out.append(
+                t.Tool(name=k, description=desc, inputSchema=schema)
+            )
+        return out
+             
+
+    @mcp.call_tool()
+    async def _call_tool(name: str, arguments: Dict | None) -> List[t.TextContent]:
+        if name not in tools:
+            raise HTTPException(404, "tool not found")
+        
+        proxy, _ = tools[name]
+        try:
+            res = await proxy(**(arguments or {}))
+        except HTTPException as exc:
+            # map server‑side errors into MCP "text/error" payloads
+            err = {"error": exc.status_code, "detail": exc.detail}
+            return [t.TextContent(type = "text", text=json.dumps(err))]
+        return [t.TextContent(type = "text", text=json.dumps(res, default=str))]
+
+    @mcp.list_resources()
+    async def _list_resources() -> List[t.Resource]:
+        return [
+            t.Resource(name=k, description=inspect.getdoc(f) or "", mime_type="application/json")
+            for k, f in resources.items()
+        ]
+
+    @mcp.read_resource()
+    async def _read_resource(name: str) -> List[t.TextContent]:
+        if name not in resources:
+            raise HTTPException(404, "resource not found")
+        res = resources[name]()
+        return [t.TextContent(type = "text", text=json.dumps(res, default=str))]
+
+    @mcp.list_resource_templates()
+    async def _list_templates() -> List[t.ResourceTemplate]:
+        return [
+            t.ResourceTemplate(
+                name=k,
+                description=inspect.getdoc(f) or "",
+                parameters={
+                    p: {"type": "string"} for p in _path_params(app, f)
+                },
+            )
+            for k, f in templates.items()
+        ]
+
+    init_opts = InitializationOptions(
+        server_name=server_name,
+        server_version="0.1.0",
+        capabilities=mcp.get_capabilities(
+            notification_options=NotificationOptions(),
+            experimental_capabilities={},
+        ),
+    )
+
+    # ── WebSocket transport ────────────────────────────────────
+    @app.websocket_route(f"{base}/ws")
+    async def _ws(ws: WebSocket):
+        await ws.accept()
+        c2s_send, c2s_recv = anyio.create_memory_object_stream(100)
+        s2c_send, s2c_recv = anyio.create_memory_object_stream(100)
+
+        from pydantic import TypeAdapter
+        from mcp.types import JSONRPCMessage
+        adapter = TypeAdapter(JSONRPCMessage)
+
+        init_done = anyio.Event()
+
+        async def srv_to_ws():
+            first = True 
+            try:
+                async for msg in s2c_recv:
+                    await ws.send_json(msg.model_dump())
+                    if first:
+                        init_done.set()
+                        first = False
+            finally:
+                # make sure cleanup survives TaskGroup cancellation
+                with anyio.CancelScope(shield=True):
+                    with suppress(RuntimeError):       # idempotent close
+                        await ws.close()
+
+        async def ws_to_srv():
+            try:
+                # 1st frame is always "initialize"
+                first = adapter.validate_python(await ws.receive_json())
+                await c2s_send.send(first)
+                await init_done.wait()          # block until server ready
+                while True:
+                    data = await ws.receive_json()
+                    await c2s_send.send(adapter.validate_python(data))
+            except WebSocketDisconnect:
+                await c2s_send.aclose()
+
+        async with anyio.create_task_group() as tg:
+            tg.start_soon(mcp.run, c2s_recv, s2c_send, init_opts)
+            tg.start_soon(ws_to_srv)
+            tg.start_soon(srv_to_ws)
+
+    # ── SSE transport (official) ─────────────────────────────
+    sse = SseServerTransport(f"{base}/messages/")
+
+    @app.get(f"{base}/sse")
+    async def _mcp_sse(request: Request):
+        async with sse.connect_sse(
+            request.scope, request.receive, request._send  # starlette ASGI primitives
+        ) as (read_stream, write_stream):
+            await mcp.run(read_stream, write_stream, init_opts)
+
+    # client → server frames are POSTed here
+    app.mount(f"{base}/messages", app=sse.handle_post_message)
+
+    # ── schema endpoint ───────────────────────────────────────
+    @app.get(f"{base}/schema")
+    async def _schema_endpoint():
+        return JSONResponse({
+            "tools": [x.model_dump() for x in await _list_tools()],
+            "resources": [x.model_dump() for x in await _list_resources()],
+            "resource_templates": [x.model_dump() for x in await _list_templates()],
+        })
+
+
+# ── helpers ────────────────────────────────────────────────────
+def _route_name(path: str) -> str:
+    return re.sub(r"[/{}}]", "_", path).strip("_")
+
+def _path_params(app: FastAPI, fn: Callable) -> List[str]:
+    for r in app.routes:
+        if r.endpoint is fn:
+            return list(r.param_convertors.keys())
+    return []
diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt
index 40a33a79..0dbb684c 100644
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -1,9 +1,15 @@
-fastapi
-uvicorn
+fastapi==0.115.12
+uvicorn==0.34.2
 gunicorn>=23.0.0
-slowapi>=0.1.9
-prometheus-fastapi-instrumentator>=7.0.2
+slowapi==0.1.9
+prometheus-fastapi-instrumentator>=7.1.0
 redis>=5.2.1
 jwt>=1.3.1
 dnspython>=2.7.0
-email-validator>=2.2.0
\ No newline at end of file
+email-validator==2.2.0
+sse-starlette==2.2.1
+pydantic==2.11
+rank-bm25==0.2.2
+anyio==4.9.0
+PyJWT==2.10.1
+
diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index ae60ffa2..7c02a74f 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -7,14 +7,47 @@ Crawl4AI FastAPI entry‑point
 """
 
 # ── stdlib & 3rd‑party imports ───────────────────────────────
-import os, sys, time, asyncio
-from typing import List, Optional, Dict
+from crawler_pool import get_crawler, close_all, janitor
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from auth import create_access_token, get_token_dependency, TokenRequest
+from pydantic import BaseModel
+from typing import Optional, List, Dict
+from fastapi import Request, Depends 
+from fastapi.responses import FileResponse
+import base64
+import re
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from api import (
+    handle_markdown_request, handle_llm_qa,
+    handle_stream_crawl_request, handle_crawl_request,
+    stream_results
+)
+from utils import (
+    FilterType, load_config, setup_logging, verify_email_domain
+)
+import os
+import sys
+import time
+import asyncio
+from typing import List
 from contextlib import asynccontextmanager
 import pathlib
 
 from fastapi import (
     FastAPI, HTTPException, Request, Path, Query, Depends
 )
+from rank_bm25 import BM25Okapi
+
+def chunk_code_functions(code: str) -> List[str]:
+    tree = ast.parse(code)
+    lines = code.splitlines()
+    chunks = []
+    for node in tree.body:
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+            start = node.lineno - 1
+            end = getattr(node, 'end_lineno', start + 1)
+            chunks.append("\n".join(lines[start:end]))
+    return chunks
 from fastapi.responses import (
     StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
 )
@@ -22,7 +55,10 @@ from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
 from fastapi.middleware.trustedhost import TrustedHostMiddleware
 from fastapi.staticfiles import StaticFiles
 
-import ast, crawl4ai as _c4
+from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool
+
+import ast
+import crawl4ai as _c4
 from pydantic import BaseModel, Field
 from slowapi import Limiter
 from slowapi.util import get_remote_address
@@ -31,17 +67,6 @@ from redis import asyncio as aioredis
 
 # ── internal imports (after sys.path append) ─────────────────
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from utils import (
-    FilterType, load_config, setup_logging, verify_email_domain
-)
-from api import (
-    handle_markdown_request, handle_llm_qa,
-    handle_stream_crawl_request, handle_crawl_request,
-    stream_results
-)
-from auth import create_access_token, get_token_dependency, TokenRequest
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from crawler_pool import get_crawler, close_all, janitor
 
 # ────────────────── configuration / logging ──────────────────
 config = load_config()
@@ -66,12 +91,16 @@ GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
 #         GLOBAL_SEM.release()                          # ← free slot
 
 orig_arun = AsyncWebCrawler.arun
+
+
 async def capped_arun(self, *a, **kw):
     async with GLOBAL_SEM:
         return await orig_arun(self, *a, **kw)
 AsyncWebCrawler.arun = capped_arun
 
 # ───────────────────── FastAPI lifespan ──────────────────────
+
+
 @asynccontextmanager
 async def lifespan(_: FastAPI):
     await get_crawler(BrowserConfig(
@@ -101,6 +130,8 @@ app.mount(
 )
 
 # Optional nice‑to‑have: opening the root shows the playground
+
+
 @app.get("/")
 async def root():
     return RedirectResponse("/playground")
@@ -114,6 +145,7 @@ limiter = Limiter(
     storage_uri=config["rate_limiting"]["storage_uri"],
 )
 
+
 def _setup_security(app_: FastAPI):
     sec = config["security"]
     if not sec["enabled"]:
@@ -124,6 +156,8 @@ def _setup_security(app_: FastAPI):
         app_.add_middleware(
             TrustedHostMiddleware, allowed_hosts=sec["trusted_hosts"]
         )
+
+
 _setup_security(app)
 
 if config["observability"]["prometheus"]["enabled"]:
@@ -131,6 +165,7 @@ if config["observability"]["prometheus"]["enabled"]:
 
 token_dep = get_token_dependency(config)
 
+
 @app.middleware("http")
 async def add_security_headers(request: Request, call_next):
     resp = await call_next(request)
@@ -144,6 +179,7 @@ ALLOWED_TYPES = {
     "BrowserConfig": BrowserConfig,
 }
 
+
 def _safe_eval_config(expr: str) -> dict:
     """
     Accept exactly one top‑level call to CrawlerRunConfig(...) or BrowserConfig(...).
@@ -159,7 +195,8 @@ def _safe_eval_config(expr: str) -> dict:
 
     call = tree.body
     if not (isinstance(call.func, ast.Name) and call.func.id in {"CrawlerRunConfig", "BrowserConfig"}):
-        raise ValueError("Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
+        raise ValueError(
+            "Only CrawlerRunConfig(...) or BrowserConfig(...) are allowed")
 
     # forbid nested calls to keep the surface tiny
     for node in ast.walk(call):
@@ -167,8 +204,10 @@ def _safe_eval_config(expr: str) -> dict:
             raise ValueError("Nested function calls are not permitted")
 
     # expose everything that crawl4ai exports, nothing else
-    safe_env = {name: getattr(_c4, name) for name in dir(_c4) if not name.startswith("_")}
-    obj = eval(compile(tree, "<config>", "eval"), {"__builtins__": {}}, safe_env)
+    safe_env = {name: getattr(_c4, name)
+                for name in dir(_c4) if not name.startswith("_")}
+    obj = eval(compile(tree, "<config>", "eval"),
+               {"__builtins__": {}}, safe_env)
     return obj.dump()
 
 
@@ -178,10 +217,42 @@ class CrawlRequest(BaseModel):
     browser_config: Optional[Dict] = Field(default_factory=dict)
     crawler_config: Optional[Dict] = Field(default_factory=dict)
 
+# ────────────── Schemas ──────────────
+class MarkdownRequest(BaseModel):  # replaces the path/query parameters of the former GET /md/{url} route
+    """Request body for the /md endpoint."""
+    url: str                    = Field(...,  description="Absolute http/https URL to fetch")  # get_markdown rejects values not starting with http:// or https://
+    f:   FilterType             = Field(FilterType.FIT,
+                                        description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
+    q:   Optional[str] = Field(None,  description="Query string used by BM25/LLM filters")
+    c:   Optional[str] = Field("0",   description="Cache‑bust / revision counter")  # forwarded unchanged to handle_markdown_request
+
+
 class RawCode(BaseModel):
     code: str
 
+class HTMLRequest(BaseModel):  # request body for POST /html
+    url: str  # page to crawl; handed to crawler.arun() as-is
+    
+class ScreenshotRequest(BaseModel):  # request body for POST /screenshot
+    url: str  # page to capture
+    screenshot_wait_for: Optional[float] = 2  # forwarded to CrawlerRunConfig(screenshot_wait_for=...); presumably seconds -- confirm
+    output_path: Optional[str] = None  # when set, the decoded PNG is written there and the response carries the path instead of the image
+
+class PDFRequest(BaseModel):  # request body for POST /pdf
+    url: str  # page to render
+    output_path: Optional[str] = None  # when set, the PDF bytes are written there and the response carries the path instead of base64 data
+
+
+class JSEndpointRequest(BaseModel):  # request body for POST /execute_js
+    url: str  # page on which the snippets run
+    scripts: List[str] = Field(  # required (no default); passed to CrawlerRunConfig(js_code=...)
+        ...,
+        description="List of separated JavaScript snippets to execute"
+    )
+
 # ──────────────────────── Endpoints ──────────────────────────
+
+
 @app.post("/token")
 async def get_token(req: TokenRequest):
     if not verify_email_domain(req.email):
@@ -189,6 +260,7 @@ async def get_token(req: TokenRequest):
     token = create_access_token({"sub": req.email})
     return {"email": req.email, "access_token": token, "token_type": "bearer"}
 
+
 @app.post("/config/dump")
 async def config_dump(raw: RawCode):
     try:
@@ -197,18 +269,164 @@ async def config_dump(raw: RawCode):
         raise HTTPException(400, str(e))
 
 
-@app.get("/md/{url:path}")
+@app.post("/md")
 @limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("md")
 async def get_markdown(
     request: Request,
-    url: str,
-    f: FilterType = FilterType.FIT,
-    q: Optional[str] = None,
-    c: str = "0",
+    body: MarkdownRequest,
     _td: Dict = Depends(token_dep),
 ):
-    md = await handle_markdown_request(url, f, q, c, config)
-    return PlainTextResponse(md)
+    if not body.url.startswith(("http://", "https://")):
+        raise HTTPException(400, "URL must be absolute and start with http/https")
+    markdown = await handle_markdown_request(
+        body.url, body.f, body.q, body.c, config
+    )
+    return JSONResponse({
+        "url": body.url,
+        "filter": body.f,
+        "query": body.q,
+        "cache": body.c,
+        "markdown": markdown,
+        "success": True
+    })
+
+
+@app.post("/html")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("html")
+async def generate_html(
+    request: Request,
+    body: HTMLRequest,
+    _td: Dict = Depends(token_dep),
+):
+    """
+    Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
+    Use when you need sanitized HTML structures for building schemas or further processing.
+    """
+    cfg = CrawlerRunConfig()
+    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+        results = await crawler.arun(url=body.url, config=cfg)
+    raw_html = results[0].html
+    from crawl4ai.utils import preprocess_html_for_schema
+    processed_html = preprocess_html_for_schema(raw_html)
+    return JSONResponse({"html": processed_html, "url": body.url, "success": True})
+
+# Screenshot endpoint
+
+@app.post("/screenshot")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("screenshot")
+async def generate_screenshot(
+    request: Request,
+    body: ScreenshotRequest, 
+    _td: Dict = Depends(token_dep),
+):
+    """
+    Capture a full-page PNG screenshot of the specified URL, waiting an optional delay before capture,
+    Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
+    Then in result instead of the screenshot you will get a path to the saved file.
+    """
+    cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
+    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+        results = await crawler.arun(url=body.url, config=cfg)
+    screenshot_data = results[0].screenshot
+    if body.output_path:
+        abs_path = os.path.abspath(body.output_path)
+        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+        with open(abs_path, "wb") as f:
+            f.write(base64.b64decode(screenshot_data))
+        return {"success": True, "path": abs_path}
+    return {"success": True, "screenshot": screenshot_data}
+
+# PDF endpoint
+
+@app.post("/pdf")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("pdf")
+async def generate_pdf(
+    request: Request,
+    body: PDFRequest, 
+    _td: Dict = Depends(token_dep),
+):
+    """
+    Generate a PDF document of the specified URL,
+    Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
+    Then in result instead of the PDF you will get a path to the saved file.
+    """
+    cfg = CrawlerRunConfig(pdf=True)
+    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
+        results = await crawler.arun(url=body.url, config=cfg)
+    pdf_data = results[0].pdf
+    if body.output_path:
+        abs_path = os.path.abspath(body.output_path)
+        os.makedirs(os.path.dirname(abs_path), exist_ok=True)
+        with open(abs_path, "wb") as f:
+            f.write(pdf_data)
+        return {"success": True, "path": abs_path}
+    return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
+
+
+@app.post("/execute_js")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("execute_js")
+async def execute_js(
+    request: Request,  # unused in the body; presumably required by @limiter.limit -- confirm
+    body: JSEndpointRequest,
+    _td: Dict = Depends(token_dep),  # auth dependency; the resolved value is not read here
+):
+    """
+    Execute a sequence of JavaScript snippets on the specified URL.
+    Return the full CrawlResult JSON (first result).
+    Use this when you need to interact with dynamic pages using JS.
+    REMEMBER: Scripts accept a list of separated JS snippets to execute and execute them in order.
+    IMPORTANT: Each script should be an expression that returns a value. It can be an IIFE or an async function. You can think of it as such.
+        Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value.
+    Return Format:
+        - The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints.
+        
+        ```python
+        class CrawlResult(BaseModel):
+            url: str
+            html: str
+            success: bool
+            cleaned_html: Optional[str] = None
+            media: Dict[str, List[Dict]] = {}
+            links: Dict[str, List[Dict]] = {}
+            downloaded_files: Optional[List[str]] = None
+            js_execution_result: Optional[Dict[str, Any]] = None
+            screenshot: Optional[str] = None
+            pdf: Optional[bytes] = None
+            mhtml: Optional[str] = None
+            _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
+            extracted_content: Optional[str] = None
+            metadata: Optional[dict] = None
+            error_message: Optional[str] = None
+            session_id: Optional[str] = None
+            response_headers: Optional[dict] = None
+            status_code: Optional[int] = None
+            ssl_certificate: Optional[SSLCertificate] = None
+            dispatch_result: Optional[DispatchResult] = None
+            redirected_url: Optional[str] = None
+            network_requests: Optional[List[Dict[str, Any]]] = None
+            console_messages: Optional[List[Dict[str, Any]]] = None
+
+        class MarkdownGenerationResult(BaseModel):
+            raw_markdown: str
+            markdown_with_citations: str
+            references_markdown: str
+            fit_markdown: Optional[str] = None
+            fit_html: Optional[str] = None
+        ```
+        
+    """
+    cfg = CrawlerRunConfig(js_code=body.scripts)  # the snippet list is passed through in request order
+    async with AsyncWebCrawler(config=BrowserConfig()) as crawler:  # a fresh crawler per request, closed on exit
+        results = await crawler.arun(url=body.url, config=cfg)
+    # Return JSON-serializable dict of the first CrawlResult
+    data = results[0].model_dump()  # NOTE(review): `pdf` is typed bytes above; presumably None here since no PDF is requested -- confirm it stays JSON-encodable
+    return JSONResponse(data)
+
 
 @app.get("/llm/{url:path}")
 async def llm_endpoint(
@@ -224,27 +442,35 @@ async def llm_endpoint(
     answer = await handle_llm_qa(url, q, config)
     return JSONResponse({"answer": answer})
 
+
 @app.get("/schema")
 async def get_schema():
     from crawl4ai import BrowserConfig, CrawlerRunConfig
     return {"browser": BrowserConfig().dump(),
             "crawler": CrawlerRunConfig().dump()}
 
+
 @app.get(config["observability"]["health_check"]["endpoint"])
 async def health():
     return {"status": "ok", "timestamp": time.time(), "version": __version__}
 
+
 @app.get(config["observability"]["prometheus"]["endpoint"])
 async def metrics():
     return RedirectResponse(config["observability"]["prometheus"]["endpoint"])
 
+
 @app.post("/crawl")
 @limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("crawl")
 async def crawl(
     request: Request,
     crawl_request: CrawlRequest,
     _td: Dict = Depends(token_dep),
 ):
+    """
+    Crawl a list of URLs and return the results as JSON.
+    """
     if not crawl_request.urls:
         raise HTTPException(400, "At least one URL required")
     res = await handle_crawl_request(
@@ -255,6 +481,7 @@ async def crawl(
     )
     return JSONResponse(res)
 
+
 @app.post("/crawl/stream")
 @limiter.limit(config["rate_limiting"]["default_limit"])
 async def crawl_stream(
@@ -280,6 +507,133 @@ async def crawl_stream(
         },
     )
 
+def chunk_code_functions(code_md: str) -> List[str]:
+    """Return a "# File: <path>" + source chunk per top-level def/class in each file's ```py block; unparsable blocks are skipped."""
+    pattern = re.compile(
+        # match "## File: <path>" then a ```py fence, then capture until the closing ```
+        r'##\s*File:\s*(?P<path>[^\r\n]+?)\s*?\r?\n'  # file header; path must stay on one line
+        r'```py\s*?\r?\n'                         # opening fence
+        r'(?P<code>.*?)(?=\r?\n```)',             # code block
+        re.DOTALL
+    )
+    chunks: List[str] = []
+    for m in pattern.finditer(code_md):
+        file_path, code_blk = m.group("path").strip(), m.group("code")
+        try:
+            tree = ast.parse(code_blk)
+        except (SyntaxError, ValueError):
+            continue  # a malformed or truncated block must not fail the whole request
+        lines = code_blk.splitlines()
+        for node in tree.body:
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+                start, end = node.lineno - 1, getattr(node, "end_lineno", node.lineno)
+                chunks.append(f"# File: {file_path}\n" + "\n".join(lines[start:end]))
+    return chunks
+
+def chunk_doc_sections(doc: str) -> List[str]:
+    """Split markdown into sections, each starting at an ATX heading; text before the first heading is its own section."""
+    sections: List[str] = []
+    current: List[str] = []
+    in_fence = False  # inside a ``` block a leading '#' is a code comment, not a heading
+    for line in doc.splitlines(keepends=True):
+        if line.lstrip().startswith("```"):
+            in_fence = not in_fence
+        elif current and not in_fence and re.match(r"^#{1,6}\s", line):
+            sections.append("".join(current))  # a heading closes the section built so far
+            current = []
+        current.append(line)
+    # flush the trailing section, if any
+    return sections + ["".join(current)] if current else sections
+
+@app.get("/ask")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("ask")
+async def get_context(
+    request: Request,
+    _td: Dict = Depends(token_dep),
+    context_type: str = Query("all", regex="^(code|doc|all)$"),
+    query: Optional[str] = Query(None, description="search query to filter chunks"),
+    score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
+    max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
+):
+    """
+    This endpoint is designed for any question about the Crawl4AI library. It returns extensive markdown context about Crawl4AI.
+    Use it from AI assistants to retrieve library context for decision making or code generation tasks.
+    It is always best practice to provide a query to filter the context; otherwise the response will be very long.
+    
+    Parameters:
+    - context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
+    - query: RECOMMENDED search query to filter chunks using BM25. You can leave this empty to get all the context.
+    - score_ratio: Minimum score as a fraction of the maximum score for filtering results.
+    - max_results: Maximum number of results to return. Default is 20.
+    
+    Returns:
+    - JSON response with the requested context.
+    - Without a query: the raw files under "code_context" and/or "doc_context".
+    - With a query: scored chunks under "code_results" and/or "doc_results" (each item has "text" and "score").
+    - context_type selects which of the code/doc keys are present; "all" returns both.
+    """
+    # load contexts
+    base = os.path.dirname(__file__)
+    code_path = os.path.join(base, "c4ai-code-context.md")
+    doc_path  = os.path.join(base, "c4ai-doc-context.md")
+    if not os.path.exists(code_path) or not os.path.exists(doc_path):
+        raise HTTPException(404, "Context files not found")
+
+    with open(code_path, "r", encoding="utf-8") as f:
+        code_content = f.read()
+    with open(doc_path, "r", encoding="utf-8") as f:
+        doc_content = f.read()
+
+    # if no query, just return raw contexts
+    if not query:
+        if context_type == "code":
+            return JSONResponse({"code_context": code_content})
+        if context_type == "doc":
+            return JSONResponse({"doc_context": doc_content})
+        return JSONResponse({
+            "code_context": code_content,
+            "doc_context": doc_content,
+        })
+
+    tokens = query.split()
+    results: Dict[str, List[Dict[str, float]]] = {}
+
+    # code BM25 over functions/classes
+    if context_type in ("code", "all"):
+        code_chunks = chunk_code_functions(code_content)
+        # BM25Okapi cannot be built from an empty corpus (ZeroDivisionError), so skip scoring then
+        scores = BM25Okapi([c.split() for c in code_chunks]).get_scores(tokens) if code_chunks else []
+        max_sc = float(scores.max()) if len(scores) else 0.0
+        cutoff = max_sc * score_ratio
+        picked = [(c, s) for c, s in zip(code_chunks, scores) if s >= cutoff]
+        picked = sorted(picked, key=lambda x: x[1], reverse=True)[:max_results]
+        results["code_results"] = [{"text": c, "score": float(s)} for c, s in picked]
+
+    # doc BM25 over markdown sections
+    if context_type in ("doc", "all"):
+        sections = chunk_doc_sections(doc_content)
+        # same empty-corpus guard as for the code chunks
+        scores_d = BM25Okapi([sec.split() for sec in sections]).get_scores(tokens) if sections else []
+        max_sd = float(scores_d.max()) if len(scores_d) else 0.0
+        cutoff_d = max_sd * score_ratio
+        idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
+        # widen every hit by its previous and next section for surrounding context
+        neighbors = set(i for idx in idxs for i in (idx-1, idx, idx+1))
+        valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)][:max_results]
+        results["doc_results"] = [
+            {"text": sections[i], "score": float(scores_d[i])} for i in valid
+        ]
+
+    return JSONResponse(results)
+    
+
+# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema); base_url is built from the configured app host and port
+attach_mcp(
+    app,
+    base_url=f"http://{config['app']['host']}:{config['app']['port']}"
+)
+
 # ────────────────────────── cli ──────────────────────────────
 if __name__ == "__main__":
     import uvicorn
diff --git a/tests/mcp/test_mcp_socket.py b/tests/mcp/test_mcp_socket.py
new file mode 100644
index 00000000..ecb3070f
--- /dev/null
+++ b/tests/mcp/test_mcp_socket.py
@@ -0,0 +1,119 @@
+# pip install "mcp-sdk[ws]" anyio
+import anyio, json
+from mcp.client.websocket import websocket_client
+from mcp.client.session import ClientSession
+
+async def test_list():
+    """Open a dedicated WebSocket session and print the names of all tools, resources and resource templates."""
+    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w), ClientSession(r, w) as s:
+        await s.initialize()
+        tools = (await s.list_tools()).tools
+        print("tools      :", [tool.name for tool in tools])
+        print("resources  :", [res.name for res in (await s.list_resources()).resources])
+        print("templates  :", [tpl.name for tpl in (await s.list_resource_templates()).resource_templates])
+
+
+async def test_crawl(s: ClientSession) -> None:
+    """Call the 'crawl' MCP tool for example.com with empty configs and print the decoded JSON."""
+    payload = {
+        "urls": ["https://example.com"],
+        "browser_config": {},
+        "crawler_config": {},
+    }
+    # the tool result is read as JSON text from the first content item
+    res = await s.call_tool("crawl", payload)
+    decoded = json.loads(res.content[0].text)
+    print("crawl →", decoded)
+
+
+async def test_md(s: ClientSession) -> None:
+    """Call the 'md' MCP tool for example.com and print the first 100 characters of the markdown."""
+    args = {
+        "url": "https://example.com",
+        "f": "fit",   # or RAW, BM25, LLM
+        "q": None,
+        "c": "0",
+    }
+    res = await s.call_tool("md", args)
+    # the rendered text is read from the "markdown" key of the JSON payload
+    markdown = json.loads(res.content[0].text)['markdown']
+    preview = markdown[:100]
+    print("md →", preview, "...")
+
+async def test_screenshot(s: ClientSession):
+    """Call the 'screenshot' MCP tool for example.com and print the start of the returned base64 image."""
+    args = {
+        "url": "https://example.com",
+        "screenshot_wait_for": 1.0,
+    }
+    res = await s.call_tool("screenshot", args)
+    body = json.loads(res.content[0].text)
+    png_b64 = body["screenshot"]
+    print("screenshot →", png_b64[:60], "… (base64)")
+
+
+async def test_pdf(s: ClientSession):
+    """Call the 'pdf' MCP tool for example.com and print the start of the returned base64 document."""
+    args = {"url": "https://example.com"}
+    res = await s.call_tool("pdf", args)
+    # the document is read from the "pdf" key of the JSON payload
+    body = json.loads(res.content[0].text)
+    pdf_b64 = body["pdf"]
+    head = pdf_b64[:60]
+    print("pdf →", head, "… (base64)")
+
+async def test_execute_js(s: ClientSession):
+    """Run two JS snippets (click the “More” link, wait 1 s) on the Hacker News front page via 'execute_js'."""
+    res = await s.call_tool(
+        "execute_js",
+        {
+            "url": "https://news.ycombinator.com/news",
+            "scripts": [  # the server handler reads the snippets from body.scripts
+                "await page.click('a.morelink')",
+                "await page.waitForTimeout(1000)",
+            ],
+        },
+    )
+    crawl_result = json.loads(res.content[0].text)
+    print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+    
+async def test_html(s: ClientSession):
+    """Call the 'html' MCP tool for the Hacker News front page and print the status and HTML length."""
+    res = await s.call_tool(
+        "html",
+        {
+            "url": "https://news.ycombinator.com/news",
+        },
+    )
+    crawl_result = json.loads(res.content[0].text)
+    print("html → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+    
+async def test_context(s: ClientSession):
+    """Ask the 'ask' MCP tool a library question and print how many code/doc chunks came back."""
+    res = await s.call_tool(
+        "ask",
+        {
+            "query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?"
+        },
+    )
+    answer = json.loads(res.content[0].text)
+    print("ask → code chunks:", len(answer.get("code_results", [])), "| doc chunks:", len(answer.get("doc_results", [])))
+
+
+async def main() -> None:
+    """Connect to the local MCP WebSocket endpoint, list the tools, then run the uncommented scenarios."""
+    async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w), ClientSession(r, w) as s:
+        await s.initialize()                       # handshake
+        tools = (await s.list_tools()).tools
+        print("tools:", [t.name for t in tools])
+        # await test_list()
+        # await test_crawl(s)
+        # await test_md(s)
+        # await test_screenshot(s)
+        # await test_pdf(s)
+        # await test_execute_js(s)
+        # await test_html(s)
+        await test_context(s)
+
+if __name__ == "__main__":
+    anyio.run(main)
diff --git a/tests/mcp/test_mcp_sse.py b/tests/mcp/test_mcp_sse.py
new file mode 100644
index 00000000..d9eee557
--- /dev/null
+++ b/tests/mcp/test_mcp_sse.py
@@ -0,0 +1,11 @@
+from mcp.client.sse import sse_client
+from mcp.client.session import ClientSession
+
+async def main():  # smoke test: list the tools exposed over the MCP SSE transport
+    async with sse_client("http://127.0.0.1:8020/mcp") as (r, w), ClientSession(r, w) as sess:
+        await sess.initialize()             # same handshake the WebSocket test performs before any request
+        print(await sess.list_tools())
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
diff --git a/tests/memory/test_docker_congif_gen.py b/tests/memory/test_docker_config_gen.py
similarity index 87%
rename from tests/memory/test_docker_congif_gen.py
rename to tests/memory/test_docker_config_gen.py
index 2da26078..ae6e533c 100644
--- a/tests/memory/test_docker_congif_gen.py
+++ b/tests/memory/test_docker_config_gen.py
@@ -11,7 +11,8 @@ If the server isn’t running, start it first:
 
 import sys, json, textwrap, requests
 
-BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
+# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
+BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
 URL  = f"{BASE.rstrip('/')}/config/dump"
 
 CASES = [