diff --git a/.gitattributes b/.gitattributes
index 144fe136..0af13c51 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -9,4 +9,4 @@ docs/md_v2/* linguist-documentation
*.py linguist-language=Python
# Exclude HTML from language statistics
-*.html linguist-detectable=false
\ No newline at end of file
+*.html linguist-detectable=false
diff --git a/.gitignore b/.gitignore
index c7ebf2e4..4f469aa6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -226,8 +226,5 @@ tree.md
.local
.do
/plans
-plans/
-
-# Codeium
.codeiumignore
todo/
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afa841c9..55674100 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+### [Added] 2025-01-20
+- New LLM-powered schema generation utility for JsonElementExtractionStrategy
+- Support for automatic CSS and XPath schema generation using OpenAI or Ollama
+- Comprehensive documentation and examples for schema generation
+- New prompt templates optimized for HTML schema analysis
+
# Changelog
All notable changes to Crawl4AI will be documented in this file.
diff --git a/README.md b/README.md
index dbccf547..aacd72a9 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
π **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes β](https://docs.crawl4ai.com/blog)
+
+## 🤓 My Personal Story
+
+My journey with computers started in childhood when my dad, a computer scientist, introduced me to an Amstrad computer. Those early days sparked a fascination with technology, leading me to pursue computer science and specialize in NLP during my postgraduate studies. It was during this time that I first delved into web crawling, building tools to help researchers organize papers and extract information from publications — a challenging yet rewarding experience that honed my skills in data extraction.
+
+Fast forward to 2023, I was working on a tool for a project and needed a crawler to convert a webpage into markdown. While exploring solutions, I found one that claimed to be open-source but required creating an account and generating an API token. Worse, it turned out to be a SaaS model charging $16, and its quality didn't meet my standards. Frustrated, I realized this was a deeper problem. That frustration turned into turbo anger mode, and I decided to build my own solution. In just a few days, I created Crawl4AI. To my surprise, it went viral, earning thousands of GitHub stars and resonating with a global community.
+
+I made Crawl4AI open-source for two reasons. First, it's my way of giving back to the open-source community that has supported me throughout my career. Second, I believe data should be accessible to everyone, not locked behind paywalls or monopolized by a few. Open access to data lays the foundation for the democratization of AI—a vision where individuals can train their own models and take ownership of their information. This library is the first step in a larger journey to create the best open-source data extraction and generation tool the world has ever seen, built collaboratively by a passionate community.
+
+Thank you to everyone who has supported this project, used it, and shared feedback. Your encouragement motivates me to dream even bigger. Join us, file issues, submit PRs, or spread the word. Together, we can build a tool that truly empowers people to access their own data and reshape the future of AI.
+
+
## π§ Why Crawl4AI?
1. **Built for LLMs**: Creates smart, concise Markdown optimized for RAG and fine-tuning applications.
diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index d297dfca..beda64f8 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -2,45 +2,84 @@
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig
-from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
+from .content_scraping_strategy import (
+ ContentScrapingStrategy,
+ WebScrapingStrategy,
+ LXMLWebScrapingStrategy,
+)
+from .extraction_strategy import (
+ ExtractionStrategy,
+ LLMExtractionStrategy,
+ CosineStrategy,
+ JsonCssExtractionStrategy,
+)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
-from .models import CrawlResult
-from .__version__ import __version__
+from .models import CrawlResult, MarkdownGenerationResult
+from .async_dispatcher import (
+ MemoryAdaptiveDispatcher,
+ SemaphoreDispatcher,
+ RateLimiter,
+ CrawlerMonitor,
+ DisplayMode,
+ BaseDispatcher
+)
__all__ = [
"AsyncWebCrawler",
"CrawlResult",
"CacheMode",
- 'BrowserConfig',
- 'CrawlerRunConfig',
- 'ExtractionStrategy',
- 'LLMExtractionStrategy',
- 'CosineStrategy',
- 'JsonCssExtractionStrategy',
- 'ChunkingStrategy',
- 'RegexChunking',
- 'DefaultMarkdownGenerator',
- 'PruningContentFilter',
- 'BM25ContentFilter',
+ "ContentScrapingStrategy",
+ "WebScrapingStrategy",
+ "LXMLWebScrapingStrategy",
+ "BrowserConfig",
+ "CrawlerRunConfig",
+ "ExtractionStrategy",
+ "LLMExtractionStrategy",
+ "CosineStrategy",
+ "JsonCssExtractionStrategy",
+ "ChunkingStrategy",
+ "RegexChunking",
+ "DefaultMarkdownGenerator",
+ "PruningContentFilter",
+ "BM25ContentFilter",
+ "BaseDispatcher",
+ "MemoryAdaptiveDispatcher",
+ "SemaphoreDispatcher",
+ "RateLimiter",
+ "CrawlerMonitor",
+ "DisplayMode",
+ "MarkdownGenerationResult",
]
+
def is_sync_version_installed():
try:
import selenium
+
return True
except ImportError:
return False
+
if is_sync_version_installed():
try:
from .web_crawler import WebCrawler
+
__all__.append("WebCrawler")
except ImportError:
- import warnings
- print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.")
+ print(
+ "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
+ )
else:
WebCrawler = None
# import warnings
- # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
\ No newline at end of file
+ # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")
+
+import warnings
+from pydantic import warnings as pydantic_warnings
+
+# Disable all Pydantic warnings
+warnings.filterwarnings("ignore", module="pydantic")
+# pydantic_warnings.filter_warnings()
\ No newline at end of file
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 8ec3d053..ea8194f4 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
# crawl4ai/_version.py
-__version__ = "0.4.247"
+__version__ = "0.4.248"
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index a4de071f..f4914726 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -5,13 +5,13 @@ from .config import (
PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS,
-
)
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
-from .chunking_strategy import ChunkingStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
-from typing import Union, List
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
+from typing import Optional, Union, List
class BrowserConfig:
@@ -38,7 +38,7 @@ class BrowserConfig:
is "chromium". Default: "chromium".
channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
is "chromium". Default: "chromium".
- proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
+ proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
Default: None.
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
@@ -81,10 +81,10 @@ class BrowserConfig:
user_data_dir: str = None,
chrome_channel: str = "chromium",
channel: str = "chromium",
- proxy: str = None,
+ proxy: Optional[str] = None,
proxy_config: dict = None,
viewport_width: int = 1080,
- viewport_height: int = 600,
+ viewport_height: int = 600,
accept_downloads: bool = False,
downloads_path: str = None,
storage_state=None,
@@ -103,7 +103,7 @@ class BrowserConfig:
text_mode: bool = False,
light_mode: bool = False,
extra_args: list = None,
- debugging_port : int = 9222,
+ debugging_port: int = 9222,
):
self.browser_type = browser_type
self.headless = headless
@@ -112,6 +112,9 @@ class BrowserConfig:
self.user_data_dir = user_data_dir
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
self.channel = channel or self.browser_type or "chromium"
+ if self.browser_type in ["firefox", "webkit"]:
+ self.channel = ""
+ self.chrome_channel = ""
self.proxy = proxy
self.proxy_config = proxy_config
self.viewport_width = viewport_width
@@ -142,7 +145,7 @@ class BrowserConfig:
self.user_agent = user_agenr_generator.generate()
else:
pass
-
+
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
self.headers.setdefault("sec-ch-ua", self.browser_hint)
@@ -183,6 +186,50 @@ class BrowserConfig:
extra_args=kwargs.get("extra_args", []),
)
+ def to_dict(self):
+ return {
+ "browser_type": self.browser_type,
+ "headless": self.headless,
+ "use_managed_browser": self.use_managed_browser,
+ "use_persistent_context": self.use_persistent_context,
+ "user_data_dir": self.user_data_dir,
+ "chrome_channel": self.chrome_channel,
+ "channel": self.channel,
+ "proxy": self.proxy,
+ "proxy_config": self.proxy_config,
+ "viewport_width": self.viewport_width,
+ "viewport_height": self.viewport_height,
+ "accept_downloads": self.accept_downloads,
+ "downloads_path": self.downloads_path,
+ "storage_state": self.storage_state,
+ "ignore_https_errors": self.ignore_https_errors,
+ "java_script_enabled": self.java_script_enabled,
+ "cookies": self.cookies,
+ "headers": self.headers,
+ "user_agent": self.user_agent,
+ "user_agent_mode": self.user_agent_mode,
+ "user_agent_generator_config": self.user_agent_generator_config,
+ "text_mode": self.text_mode,
+ "light_mode": self.light_mode,
+ "extra_args": self.extra_args,
+ "sleep_on_close": self.sleep_on_close,
+ "verbose": self.verbose,
+ "debugging_port": self.debugging_port,
+ }
+
+ def clone(self, **kwargs):
+ """Create a copy of this configuration with updated values.
+
+ Args:
+ **kwargs: Key-value pairs of configuration options to update
+
+ Returns:
+ BrowserConfig: A new instance with the specified updates
+ """
+ config_dict = self.to_dict()
+ config_dict.update(kwargs)
+ return BrowserConfig.from_kwargs(config_dict)
+
class CrawlerRunConfig:
"""
@@ -221,6 +268,8 @@ class CrawlerRunConfig:
Default: False.
parser_type (str): Type of parser to use for HTML parsing.
Default: "lxml".
+ scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
+ Default: WebScrapingStrategy.
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
@@ -237,6 +286,8 @@ class CrawlerRunConfig:
Default: False.
no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
Default: False.
+ shared_data (dict or None): Shared data to be passed between hooks.
+ Default: None.
# Page Navigation and Timing Parameters
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
@@ -311,6 +362,14 @@ class CrawlerRunConfig:
Default: True.
log_console (bool): If True, log console messages from the page.
Default: False.
+
+ # Streaming Parameters
+ stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
+ Default: False.
+
+    # Optional Parameters
+    url (str or None): The URL to crawl; optional because it can also be
+        supplied at call time instead of at configuration time. Default: None.
"""
def __init__(
@@ -318,7 +377,7 @@ class CrawlerRunConfig:
# Content Processing Parameters
word_count_threshold: int = MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
- chunking_strategy: ChunkingStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None,
content_filter=None,
only_text: bool = False,
@@ -329,10 +388,9 @@ class CrawlerRunConfig:
remove_forms: bool = False,
prettiify: bool = False,
parser_type: str = "lxml",
-
+ scraping_strategy: ContentScrapingStrategy = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
-
# Caching Parameters
cache_mode=None,
session_id: str = None,
@@ -340,7 +398,7 @@ class CrawlerRunConfig:
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
-
+ shared_data: dict = None,
# Page Navigation and Timing Parameters
wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT,
@@ -350,7 +408,6 @@ class CrawlerRunConfig:
mean_delay: float = 0.1,
max_range: float = 0.3,
semaphore_count: int = 5,
-
# Page Interaction Parameters
js_code: Union[str, List[str]] = None,
js_only: bool = False,
@@ -363,7 +420,6 @@ class CrawlerRunConfig:
override_navigator: bool = False,
magic: bool = False,
adjust_viewport_to_content: bool = False,
-
# Media Handling Parameters
screenshot: bool = False,
screenshot_wait_for: float = None,
@@ -372,21 +428,20 @@ class CrawlerRunConfig:
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
exclude_external_images: bool = False,
-
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
exclude_social_media_links: bool = False,
exclude_domains: list = None,
-
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
-
+ # Streaming Parameters
+ stream: bool = False,
url: str = None,
):
self.url = url
-
+
# Content Processing Parameters
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
@@ -401,6 +456,7 @@ class CrawlerRunConfig:
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
+ self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -412,6 +468,7 @@ class CrawlerRunConfig:
self.disable_cache = disable_cache
self.no_cache_read = no_cache_read
self.no_cache_write = no_cache_write
+ self.shared_data = shared_data
# Page Navigation and Timing Parameters
self.wait_until = wait_until
@@ -446,7 +503,9 @@ class CrawlerRunConfig:
self.exclude_external_images = exclude_external_images
# Link and Domain Handling Parameters
- self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+ self.exclude_social_media_domains = (
+ exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+ )
self.exclude_external_links = exclude_external_links
self.exclude_social_media_links = exclude_social_media_links
self.exclude_domains = exclude_domains or []
@@ -455,19 +514,25 @@ class CrawlerRunConfig:
self.verbose = verbose
self.log_console = log_console
+ # Streaming Parameters
+ self.stream = stream
+
# Validate type of extraction strategy and chunking strategy if they are provided
if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy
):
- raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
+ raise ValueError(
+ "extraction_strategy must be an instance of ExtractionStrategy"
+ )
if self.chunking_strategy is not None and not isinstance(
self.chunking_strategy, ChunkingStrategy
):
- raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")
+ raise ValueError(
+ "chunking_strategy must be an instance of ChunkingStrategy"
+ )
# Set default chunking strategy if None
if self.chunking_strategy is None:
- from .chunking_strategy import RegexChunking
self.chunking_strategy = RegexChunking()
@staticmethod
@@ -476,7 +541,7 @@ class CrawlerRunConfig:
# Content Processing Parameters
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
- chunking_strategy=kwargs.get("chunking_strategy"),
+ chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
only_text=kwargs.get("only_text", False),
@@ -487,10 +552,9 @@ class CrawlerRunConfig:
remove_forms=kwargs.get("remove_forms", False),
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
-
+ scraping_strategy=kwargs.get("scraping_strategy"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
-
# Caching Parameters
cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"),
@@ -498,7 +562,7 @@ class CrawlerRunConfig:
disable_cache=kwargs.get("disable_cache", False),
no_cache_read=kwargs.get("no_cache_read", False),
no_cache_write=kwargs.get("no_cache_write", False),
-
+ shared_data=kwargs.get("shared_data", None),
# Page Navigation and Timing Parameters
wait_until=kwargs.get("wait_until", "domcontentloaded"),
page_timeout=kwargs.get("page_timeout", 60000),
@@ -508,7 +572,6 @@ class CrawlerRunConfig:
mean_delay=kwargs.get("mean_delay", 0.1),
max_range=kwargs.get("max_range", 0.3),
semaphore_count=kwargs.get("semaphore_count", 5),
-
# Page Interaction Parameters
js_code=kwargs.get("js_code"),
js_only=kwargs.get("js_only", False),
@@ -521,29 +584,36 @@ class CrawlerRunConfig:
override_navigator=kwargs.get("override_navigator", False),
magic=kwargs.get("magic", False),
adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
-
# Media Handling Parameters
screenshot=kwargs.get("screenshot", False),
screenshot_wait_for=kwargs.get("screenshot_wait_for"),
- screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
+ screenshot_height_threshold=kwargs.get(
+ "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
+ ),
pdf=kwargs.get("pdf", False),
- image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
- image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
+ image_description_min_word_threshold=kwargs.get(
+ "image_description_min_word_threshold",
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+ ),
+ image_score_threshold=kwargs.get(
+ "image_score_threshold", IMAGE_SCORE_THRESHOLD
+ ),
exclude_external_images=kwargs.get("exclude_external_images", False),
-
# Link and Domain Handling Parameters
- exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
+ exclude_social_media_domains=kwargs.get(
+ "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
+ ),
exclude_external_links=kwargs.get("exclude_external_links", False),
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
exclude_domains=kwargs.get("exclude_domains", []),
-
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
-
+ # Streaming Parameters
+ stream=kwargs.get("stream", False),
url=kwargs.get("url"),
)
-
+
# Create a funciton returns dict of the object
def to_dict(self):
return {
@@ -560,6 +630,7 @@ class CrawlerRunConfig:
"remove_forms": self.remove_forms,
"prettiify": self.prettiify,
"parser_type": self.parser_type,
+ "scraping_strategy": self.scraping_strategy,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,
@@ -567,6 +638,7 @@ class CrawlerRunConfig:
"disable_cache": self.disable_cache,
"no_cache_read": self.no_cache_read,
"no_cache_write": self.no_cache_write,
+ "shared_data": self.shared_data,
"wait_until": self.wait_until,
"page_timeout": self.page_timeout,
"wait_for": self.wait_for,
@@ -599,5 +671,32 @@ class CrawlerRunConfig:
"exclude_domains": self.exclude_domains,
"verbose": self.verbose,
"log_console": self.log_console,
+ "stream": self.stream,
"url": self.url,
}
+
+ def clone(self, **kwargs):
+ """Create a copy of this configuration with updated values.
+
+ Args:
+ **kwargs: Key-value pairs of configuration options to update
+
+ Returns:
+ CrawlerRunConfig: A new instance with the specified updates
+
+ Example:
+ ```python
+ # Create a new config with streaming enabled
+ stream_config = config.clone(stream=True)
+
+ # Create a new config with multiple updates
+ new_config = config.clone(
+ stream=True,
+ cache_mode=CacheMode.BYPASS,
+ verbose=True
+ )
+ ```
+ """
+ config_dict = self.to_dict()
+ config_dict.update(kwargs)
+ return CrawlerRunConfig.from_kwargs(config_dict)
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index b879413c..786d2fb9 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -2,28 +2,27 @@ import asyncio
import base64
import time
from abc import ABC, abstractmethod
-from typing import Callable, Dict, Any, List, Optional, Awaitable, Union
-import os, sys, shutil
-import tempfile, subprocess
-from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext
+from typing import Callable, Dict, Any, List, Optional, Union
+import os
+import sys
+import shutil
+import tempfile
+import subprocess
+from playwright.async_api import Page, Error, BrowserContext
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
-from pathlib import Path
-from playwright.async_api import ProxySettings
-from pydantic import BaseModel
import hashlib
-import json
import uuid
from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
-from .utils import get_error_context
from .user_agent_generator import UserAgentGenerator
from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
-from playwright_stealth import StealthConfig, stealth_async
+from playwright_stealth import StealthConfig
from .ssl_certificate import SSLCertificate
+from .utils import get_home_folder, get_chromium_path
stealth_config = StealthConfig(
webdriver=True,
@@ -66,7 +65,7 @@ BROWSER_DISABLE_OPTIONS = [
class ManagedBrowser:
"""
Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
-
+
Attributes:
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
Default: "chromium".
@@ -75,16 +74,16 @@ class ManagedBrowser:
headless (bool): Whether to run the browser in headless mode (no visible GUI).
Default: True.
browser_process (subprocess.Popen): The process object for the browser.
- temp_dir (str): Temporary directory for user data if not provided.
+ temp_dir (str): Temporary directory for user data if not provided.
debugging_port (int): Port for debugging the browser.
host (str): Host for debugging the browser.
-
+
Methods:
start(): Starts the browser process and returns the CDP endpoint URL.
_get_browser_path(): Returns the browser executable path based on OS and browser type.
_get_browser_args(): Returns browser-specific command line arguments.
_get_user_data_dir(): Returns the user data directory path.
- _cleanup(): Terminates the browser process and removes the temporary directory.
+ _cleanup(): Terminates the browser process and removes the temporary directory.
"""
browser_type: str
@@ -94,6 +93,7 @@ class ManagedBrowser:
temp_dir: str
debugging_port: int
host: str
+
def __init__(
self,
browser_type: str = "chromium",
@@ -105,7 +105,7 @@ class ManagedBrowser:
):
"""
Initialize the ManagedBrowser instance.
-
+
Args:
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
Default: "chromium".
@@ -116,7 +116,7 @@ class ManagedBrowser:
logger (logging.Logger): Logger instance for logging messages. Default: None.
host (str): Host for debugging the browser. Default: "localhost".
debugging_port (int): Port for debugging the browser. Default: 9222.
- """
+ """
self.browser_type = browser_type
self.user_data_dir = user_data_dir
self.headless = headless
@@ -139,8 +139,8 @@ class ManagedBrowser:
self.user_data_dir = self.temp_dir
# Get browser path and args based on OS and browser type
- browser_path = self._get_browser_path()
- args = self._get_browser_args()
+ # browser_path = self._get_browser_path()
+ args = await self._get_browser_args()
# Start browser process
try:
@@ -158,13 +158,13 @@ class ManagedBrowser:
async def _monitor_browser_process(self):
"""
Monitor the browser process for unexpected termination.
-
+
How it works:
1. Read stdout and stderr from the browser process.
2. If the process has terminated, log the error message and terminate the browser.
3. If the shutting_down flag is set, log the normal termination message.
4. If any other error occurs, log the error message.
-
+
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
if self.browser_process:
@@ -201,7 +201,7 @@ class ManagedBrowser:
params={"error": str(e)},
)
- def _get_browser_path(self) -> str:
+ def _get_browser_path_WIP(self) -> str:
"""Returns the browser executable path based on OS and browser type"""
if sys.platform == "darwin": # macOS
paths = {
@@ -224,9 +224,13 @@ class ManagedBrowser:
return paths.get(self.browser_type)
- def _get_browser_args(self) -> List[str]:
+ async def _get_browser_path(self) -> str:
+ browser_path = await get_chromium_path(self.browser_type)
+ return browser_path
+
+ async def _get_browser_args(self) -> List[str]:
"""Returns browser-specific command line arguments"""
- base_args = [self._get_browser_path()]
+ base_args = [await self._get_browser_path()]
if self.browser_type == "chromium":
args = [
@@ -289,17 +293,18 @@ class ManagedBrowser:
class BrowserManager:
"""
Manages the browser instance and context.
-
- Attributes:
+
+ Attributes:
config (BrowserConfig): Configuration object containing all browser settings
logger: Logger instance for recording events and errors
browser (Browser): The browser instance
- default_context (BrowserContext): The default browser context
+ default_context (BrowserContext): The default browser context
managed_browser (ManagedBrowser): The managed browser instance
playwright (Playwright): The Playwright instance
sessions (dict): Dictionary to store session information
session_ttl (int): Session timeout in seconds
"""
+
def __init__(self, browser_config: BrowserConfig, logger=None):
"""
Initialize the BrowserManager with a browser configuration.
@@ -321,6 +326,10 @@ class BrowserManager:
self.sessions = {}
self.session_ttl = 1800 # 30 minutes
+ # Keep track of contexts by a "config signature," so each unique config reuses a single context
+ self.contexts_by_config = {}
+ self._contexts_lock = asyncio.Lock()
+
# Initialize ManagedBrowser if needed
if self.config.use_managed_browser:
self.managed_browser = ManagedBrowser(
@@ -334,13 +343,13 @@ class BrowserManager:
async def start(self):
"""
Start the browser instance and set up the default context.
-
+
How it works:
1. Check if Playwright is already initialized.
2. If not, initialize Playwright.
3. If managed browser is used, start it and connect to the CDP endpoint.
4. If managed browser is not used, launch the browser and set up the default context.
-
+
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
if self.playwright is None:
@@ -456,7 +465,7 @@ class BrowserManager:
async def setup_context(
self,
context: BrowserContext,
- crawlerRunConfig: CrawlerRunConfig,
+ crawlerRunConfig: CrawlerRunConfig = None,
is_default=False,
):
"""
@@ -479,11 +488,11 @@ class BrowserManager:
14. Set default timeouts for navigation and download if enabled.
15. Set user agent if provided.
16. Set browser hints if provided.
-
+
Args:
context (BrowserContext): The browser context to set up
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
- is_default (bool): Flag indicating if this is the default context
+ is_default (bool): Flag indicating if this is the default context
Returns:
None
"""
@@ -501,9 +510,9 @@ class BrowserManager:
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
if self.config.downloads_path:
context._impl_obj._options["accept_downloads"] = True
- context._impl_obj._options["downloads_path"] = (
- self.config.downloads_path
- )
+ context._impl_obj._options[
+ "downloads_path"
+ ] = self.config.downloads_path
# Handle user agent and browser hints
if self.config.user_agent:
@@ -516,22 +525,31 @@ class BrowserManager:
# Add default cookie
await context.add_cookies(
- [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}]
+ [
+ {
+ "name": "cookiesEnabled",
+ "value": "true",
+ "url": crawlerRunConfig.url
+ if crawlerRunConfig
+ else "https://crawl4ai.com/",
+ }
+ ]
)
# Handle navigator overrides
- if (
- crawlerRunConfig.override_navigator
- or crawlerRunConfig.simulate_user
- or crawlerRunConfig.magic
- ):
- await context.add_init_script(load_js_script("navigator_overrider"))
+ if crawlerRunConfig:
+ if (
+ crawlerRunConfig.override_navigator
+ or crawlerRunConfig.simulate_user
+ or crawlerRunConfig.magic
+ ):
+ await context.add_init_script(load_js_script("navigator_overrider"))
async def create_browser_context(self):
"""
Creates and returns a new browser context with configured settings.
Applies text-only mode settings if text_mode is enabled in config.
-
+
Returns:
Context: Browser context object with the specified configurations
"""
@@ -542,25 +560,62 @@ class BrowserManager:
"height": self.config.viewport_height,
}
proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
-
+
blocked_extensions = [
# Images
- 'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd',
+ "jpg",
+ "jpeg",
+ "png",
+ "gif",
+ "webp",
+ "svg",
+ "ico",
+ "bmp",
+ "tiff",
+ "psd",
# Fonts
- 'woff', 'woff2', 'ttf', 'otf', 'eot',
+ "woff",
+ "woff2",
+ "ttf",
+ "otf",
+ "eot",
# Styles
# 'css', 'less', 'scss', 'sass',
# Media
- 'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v',
- 'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac',
+ "mp4",
+ "webm",
+ "ogg",
+ "avi",
+ "mov",
+ "wmv",
+ "flv",
+ "m4v",
+ "mp3",
+ "wav",
+ "aac",
+ "m4a",
+ "opus",
+ "flac",
# Documents
- 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx',
+ "pdf",
+ "doc",
+ "docx",
+ "xls",
+ "xlsx",
+ "ppt",
+ "pptx",
# Archives
- 'zip', 'rar', '7z', 'tar', 'gz',
+ "zip",
+ "rar",
+ "7z",
+ "tar",
+ "gz",
# Scripts and data
- 'xml', 'swf', 'wasm'
+ "xml",
+ "swf",
+ "wasm",
]
-
+
# Common context settings
context_settings = {
"user_agent": user_agent,
@@ -572,7 +627,7 @@ class BrowserManager:
"device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled,
}
-
+
if self.config.text_mode:
text_mode_settings = {
"has_touch": False,
@@ -580,44 +635,89 @@ class BrowserManager:
}
# Update context settings with text mode settings
context_settings.update(text_mode_settings)
-
+
# Create and return the context with all settings
context = await self.browser.new_context(**context_settings)
-
+
# Apply text mode settings if enabled
if self.config.text_mode:
# Create and apply route patterns for each extension
for ext in blocked_extensions:
await context.route(f"**/*.{ext}", lambda route: route.abort())
return context
-
- # async def get_page(self, session_id: Optional[str], user_agent: str):
+
+ def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
+ """
+ Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
+ then returns a hash of the sorted JSON. This yields a stable signature
+ that identifies configurations requiring a unique browser context.
+ """
+ import json, hashlib
+
+ config_dict = crawlerRunConfig.__dict__.copy()
+ # Exclude items that do not affect browser-level setup.
+ # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
+ ephemeral_keys = [
+ "session_id",
+ "js_code",
+ "scraping_strategy",
+ "extraction_strategy",
+ "chunking_strategy",
+ "cache_mode",
+ "content_filter",
+ "semaphore_count",
+ "url"
+ ]
+ for key in ephemeral_keys:
+ if key in config_dict:
+ del config_dict[key]
+ # Convert to canonical JSON string
+ signature_json = json.dumps(config_dict, sort_keys=True, default=str)
+
+ # Hash the JSON so we get a compact, unique string
+ signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
+ return signature_hash
+
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
"""
Get a page for the given session ID, creating a new one if needed.
-
+
Args:
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
Returns:
- Page: The page object for the given session ID.
- BrowserContext: The browser context for the given session ID.
+ (page, context): The Page and its BrowserContext
"""
self._cleanup_expired_sessions()
+ # If a session_id is provided and we already have it, reuse that page + context
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
context, page, _ = self.sessions[crawlerRunConfig.session_id]
+ # Update last-used timestamp
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
return page, context
+ # If using a managed browser, just grab the shared default_context
if self.config.use_managed_browser:
context = self.default_context
page = await context.new_page()
else:
- context = await self.create_browser_context()
- await self.setup_context(context, crawlerRunConfig)
+ # Otherwise, check if we have an existing context for this config
+ config_signature = self._make_config_signature(crawlerRunConfig)
+
+ async with self._contexts_lock:
+ if config_signature in self.contexts_by_config:
+ context = self.contexts_by_config[config_signature]
+ else:
+ # Create and setup a new context
+ context = await self.create_browser_context()
+ await self.setup_context(context, crawlerRunConfig)
+ self.contexts_by_config[config_signature] = context
+
+ # Create a new page from the chosen context
page = await context.new_page()
+ # If a session_id is specified, store this session so we can reuse later
if crawlerRunConfig.session_id:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
@@ -625,8 +725,8 @@ class BrowserManager:
async def kill_session(self, session_id: str):
"""
- Kill a browser session and clean up resources.
-
+ Kill a browser session and clean up resources.
+
Args:
session_id (str): The session ID to kill.
"""
@@ -657,6 +757,18 @@ class BrowserManager:
for session_id in session_ids:
await self.kill_session(session_id)
+ # Now close all contexts we created. This reclaims memory from ephemeral contexts.
+ for ctx in self.contexts_by_config.values():
+ try:
+ await ctx.close()
+ except Exception as e:
+ self.logger.error(
+ message="Error closing context: {error}",
+ tag="ERROR",
+ params={"error": str(e)}
+ )
+ self.contexts_by_config.clear()
+
if self.browser:
await self.browser.close()
self.browser = None
@@ -676,20 +788,20 @@ class AsyncCrawlerStrategy(ABC):
Abstract base class for crawler strategies.
Subclasses must implement the crawl method.
"""
+
@abstractmethod
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
pass # 4 + 3
-
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
Crawler strategy using Playwright.
-
+
Attributes:
browser_config (BrowserConfig): Configuration object containing browser settings.
logger (AsyncLogger): Logger instance for recording events and errors.
- _downloaded_files (List[str]): List of downloaded file paths.
+ _downloaded_files (List[str]): List of downloaded file paths.
hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
browser_manager (BrowserManager): Manager for browser creation and management.
@@ -708,8 +820,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
Kill a browser session and clean up resources.
crawl(self, url, **kwargs):
Run the crawler for a single URL.
-
+
"""
+
def __init__(
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
):
@@ -773,10 +886,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def kill_session(self, session_id: str):
"""
Kill a browser session and clean up resources.
-
+
Args:
session_id (str): The ID of the session to kill.
-
+
Returns:
None
"""
@@ -791,20 +904,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
Set a hook function for a specific hook type. Following are list of hook types:
- on_browser_created: Called when a new browser instance is created.
- - on_page_context_created: Called when a new page context is created.
- - on_user_agent_updated: Called when the user agent is updated.
- - on_execution_started: Called when the execution starts.
- - before_goto: Called before a goto operation.
- - after_goto: Called after a goto operation.
- - before_return_html: Called before returning HTML content.
- - before_retrieve_html: Called before retrieving HTML content.
-
+ - on_page_context_created: Called when a new page context is created.
+ - on_user_agent_updated: Called when the user agent is updated.
+ - on_execution_started: Called when the execution starts.
+ - before_goto: Called before a goto operation.
+ - after_goto: Called after a goto operation.
+ - before_return_html: Called before returning HTML content.
+ - before_retrieve_html: Called before retrieving HTML content.
+
All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs.
-
+
Args:
hook_type (str): The type of the hook.
hook (Callable): The hook function to set.
-
+
Returns:
None
"""
@@ -816,12 +929,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def execute_hook(self, hook_type: str, *args, **kwargs):
"""
Execute a hook function for a specific hook type.
-
+
Args:
hook_type (str): The type of the hook.
*args: Variable length positional arguments.
**kwargs: Keyword arguments.
-
+
Returns:
The return value of the hook function, if any.
"""
@@ -836,42 +949,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
def update_user_agent(self, user_agent: str):
"""
Update the user agent for the browser.
-
+
Args:
user_agent (str): The new user agent string.
-
+
Returns:
None
"""
self.user_agent = user_agent
def set_custom_headers(self, headers: Dict[str, str]):
- """
- Set custom headers for the browser.
-
+ """
+ Set custom headers for the browser.
+
Args:
headers (Dict[str, str]): A dictionary of headers to set.
-
+
Returns:
None
"""
self.headers = headers
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
- """
+ """
Wait for a condition in a smart way. This functions works as below:
-
+
1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.
-
- This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
+
+ This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().
Args:
page: Playwright page object
wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
timeout (float): Maximum time to wait in milliseconds
-
+
Returns:
None
"""
@@ -921,18 +1034,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"or explicitly prefixed with 'js:' or 'css:'."
)
- async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ):
+ async def csp_compliant_wait(
+ self, page: Page, user_wait_function: str, timeout: float = 30000
+ ):
"""
Wait for a condition in a CSP-compliant way.
-
+
Args:
page: Playwright page object
user_wait_function: JavaScript function as string that returns boolean
timeout: Maximum time to wait in milliseconds
-
+
Returns:
bool: True if condition was met, False if timed out
-
+
Raises:
RuntimeError: If there's an error evaluating the condition
"""
@@ -968,10 +1083,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def process_iframes(self, page):
"""
Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.
-
+
Args:
page: Playwright page object
-
+
Returns:
Playwright page object
"""
@@ -1033,10 +1148,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls.
This function is asynchronous and returns a string representing the session ID.
-
+
Args:
**kwargs: Optional keyword arguments to configure the session.
-
+
Returns:
str: The session ID.
"""
@@ -1049,7 +1164,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page, context = await self.browser_manager.get_page(session_id, user_agent)
return session_id
- async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse:
+ async def crawl(
+ self, url: str, config: CrawlerRunConfig, **kwargs
+ ) -> AsyncCrawlResponse:
"""
Crawls a given URL or processes raw HTML/local file content based on the URL prefix.
@@ -1108,7 +1225,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"URL must start with 'http://', 'https://', 'file://', or 'raw:'"
)
- async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse:
+ async def _crawl_web(
+ self, url: str, config: CrawlerRunConfig
+ ) -> AsyncCrawlResponse:
"""
Internal method to crawl web URLs with the specified configuration.
@@ -1122,6 +1241,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
config.url = url
response_headers = {}
status_code = None
+ final_url = url
# Reset downloaded files list for new crawl
self._downloaded_files = []
@@ -1146,7 +1266,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await context.add_init_script(load_js_script("navigator_overrider"))
# Call hook after page creation
- await self.execute_hook("on_page_context_created", page, context=context)
+ await self.execute_hook("on_page_context_created", page, context=context, config=config)
# Set up console logging if requested
if config.log_console:
@@ -1187,24 +1307,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Handle page navigation and content loading
if not config.js_only:
- await self.execute_hook("before_goto", page, context=context, url=url)
+ await self.execute_hook("before_goto", page, context=context, url=url, config=config)
try:
# Generate a unique nonce for this request
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
-
+
# Add CSP headers to the request
- await page.set_extra_http_headers({
- 'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
- })
+ await page.set_extra_http_headers(
+ {
+ "Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
+ }
+ )
response = await page.goto(
url, wait_until=config.wait_until, timeout=config.page_timeout
)
+ final_url = page.url
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
- await self.execute_hook("after_goto", page, context=context, url=url, response=response)
+ await self.execute_hook(
+ "after_goto", page, context=context, url=url, response=response, config=config
+ )
if response is None:
status_code = 200
@@ -1220,7 +1345,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Wait for body element and visibility
try:
await page.wait_for_selector("body", state="attached", timeout=30000)
-
+
# Use the new check_visibility function with csp_compliant_wait
is_visible = await self.csp_compliant_wait(
page,
@@ -1233,16 +1358,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
style.opacity !== '0';
return isVisible;
}""",
- timeout=30000
+ timeout=30000,
)
-
+
if not is_visible and not config.ignore_body_visibility:
visibility_info = await self.check_visibility(page)
raise Error(f"Body element is hidden: {visibility_info}")
- except Error as e:
+ except Error:
visibility_info = await self.check_visibility(page)
-
+
if self.config.verbose:
self.logger.debug(
message="Body visibility info: {info}",
@@ -1251,19 +1376,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
)
if not config.ignore_body_visibility:
- raise Error(f"Body element is hidden: {visibility_info}")
-
-
+ raise Error(f"Body element is hidden: {visibility_info}")
+
# try:
# await page.wait_for_selector("body", state="attached", timeout=30000)
-
+
# await page.wait_for_function(
# """
# () => {
# const body = document.body;
# const style = window.getComputedStyle(body);
- # return style.display !== 'none' &&
- # style.visibility !== 'hidden' &&
+ # return style.display !== 'none' &&
+ # style.visibility !== 'hidden' &&
# style.opacity !== '0';
# }
# """,
@@ -1302,14 +1426,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
):
await page.wait_for_load_state("domcontentloaded")
await asyncio.sleep(0.1)
-
+
# Check for image loading with improved error handling
images_loaded = await self.csp_compliant_wait(
page,
"() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)",
- timeout=1000
+ timeout=1000,
)
-
+
if not images_loaded and self.logger:
self.logger.warning(
message="Some images failed to load within timeout",
@@ -1320,8 +1444,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if not self.browser_config.text_mode and config.adjust_viewport_to_content:
try:
dimensions = await self.get_page_dimensions(page)
- page_height = dimensions['height']
- page_width = dimensions['width']
+ page_height = dimensions["height"]
+ page_width = dimensions["width"]
# page_width = await page.evaluate(
# "document.documentElement.scrollWidth"
# )
@@ -1365,18 +1489,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# elif isinstance(config.js_code, list):
# for js in config.js_code:
# await page.evaluate(js)
-
+
if config.js_code:
# execution_result = await self.execute_user_script(page, config.js_code)
- execution_result = await self.robust_execute_user_script(page, config.js_code)
+ execution_result = await self.robust_execute_user_script(
+ page, config.js_code
+ )
if not execution_result["success"]:
self.logger.warning(
message="User script execution had issues: {error}",
tag="JS_EXEC",
- params={"error": execution_result.get("error")}
- )
+ params={"error": execution_result.get("error")},
+ )
- await self.execute_hook("on_execution_started", page, context=context)
+ await self.execute_hook("on_execution_started", page, context=context, config=config)
# Handle user simulation
if config.simulate_user or config.magic:
@@ -1386,6 +1512,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await page.keyboard.press("ArrowDown")
# Handle wait_for condition
+ # TODO: Decide how to handle this
+ if not config.wait_for and config.css_selector and False:
+ config.wait_for = f"css:{config.css_selector}"
+
if config.wait_for:
try:
await self.smart_wait(
@@ -1415,7 +1545,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page = await self.process_iframes(page)
# Pre-content retrieval hooks and delay
- await self.execute_hook("before_retrieve_html", page, context=context)
+ await self.execute_hook("before_retrieve_html", page, context=context, config=config)
if config.delay_before_return_html:
await asyncio.sleep(config.delay_before_return_html)
@@ -1425,7 +1555,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Get final HTML content
html = await page.content()
- await self.execute_hook("before_return_html", page = page, html = html, context=context)
+ await self.execute_hook(
+ "before_return_html", page=page, html=html, context=context, config=config
+ )
# Handle PDF and screenshot generation
start_export_time = time.perf_counter()
@@ -1471,11 +1603,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
),
+ final_url=final_url,
)
except Exception as e:
raise e
-
+
finally:
# If no session_id is given we should close the page
if not config.session_id:
@@ -1483,20 +1616,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
"""
- Helper method to handle full page scanning.
-
+ Helper method to handle full page scanning.
+
How it works:
1. Get the viewport height.
2. Scroll to the bottom of the page.
3. Get the total height of the page.
4. Scroll back to the top of the page.
- 5. Scroll to the bottom of the page again.
+ 5. Scroll to the bottom of the page again.
6. Continue scrolling until the bottom of the page is reached.
-
+
Args:
page (Page): The Playwright page object
scroll_delay (float): The delay between page scrolls
-
+
"""
try:
viewport_height = page.viewport_size.get(
@@ -1511,8 +1644,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# total_height = await page.evaluate("document.documentElement.scrollHeight")
dimensions = await self.get_page_dimensions(page)
- total_height = dimensions['height']
-
+ total_height = dimensions["height"]
+
while current_position < total_height:
current_position = min(current_position + viewport_height, total_height)
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
@@ -1521,8 +1654,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# new_height = await page.evaluate("document.documentElement.scrollHeight")
dimensions = await self.get_page_dimensions(page)
- new_height = dimensions['height']
-
+ new_height = dimensions["height"]
+
if new_height > total_height:
total_height = new_height
@@ -1542,7 +1675,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def _handle_download(self, download):
"""
Handle file downloads.
-
+
How it works:
1. Get the suggested filename.
2. Get the download path.
@@ -1550,10 +1683,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
4. Start the download.
5. Save the downloaded file.
6. Log the completion.
-
+
Args:
download (Download): The Playwright download object
-
+
Returns:
None
"""
@@ -1598,7 +1731,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
remove_overlays_js = load_js_script("remove_overlay_elements")
try:
- await page.evaluate(f"""
+ await page.evaluate(
+ f"""
(() => {{
try {{
{remove_overlays_js}
@@ -1611,7 +1745,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}};
}}
}})()
- """)
+ """
+ )
await page.wait_for_timeout(500) # Wait for any animations to complete
except Exception as e:
self.logger.warning(
@@ -1623,10 +1758,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def export_pdf(self, page: Page) -> bytes:
"""
Exports the current page as a PDF.
-
+
Args:
page (Page): The Playwright page object
-
+
Returns:
bytes: The PDF data
"""
@@ -1636,16 +1771,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def take_screenshot(self, page, **kwargs) -> str:
"""
Take a screenshot of the current page.
-
+
Args:
page (Page): The Playwright page object
kwargs: Additional keyword arguments
-
+
Returns:
str: The base64-encoded screenshot data
"""
need_scroll = await self.page_need_scroll(page)
-
+
if not need_scroll:
# Page is short enough, just take a screenshot
return await self.take_screenshot_naive(page)
@@ -1656,13 +1791,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
"""
- Convert the first page of the PDF to a screenshot.
-
+ Convert the first page of the PDF to a screenshot.
+
Requires pdf2image and poppler.
-
+
Args:
pdf_data (bytes): The PDF data
-
+
Returns:
str: The base64-encoded screenshot data
"""
@@ -1694,21 +1829,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
Attempt to set a large viewport and take a full-page screenshot.
If still too large, segment the page as before.
-
+
Requires pdf2image and poppler.
-
+
Args:
page (Page): The Playwright page object
kwargs: Additional keyword arguments
-
+
Returns:
str: The base64-encoded screenshot data
"""
try:
# Get page height
dimensions = await self.get_page_dimensions(page)
- page_width = dimensions['width']
- page_height = dimensions['height']
+ page_width = dimensions["width"]
+ page_height = dimensions["height"]
# page_height = await page.evaluate("document.documentElement.scrollHeight")
# page_width = await page.evaluate("document.documentElement.scrollWidth")
@@ -1805,10 +1940,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
Exports the current storage state (cookies, localStorage, sessionStorage)
to a JSON file at the specified path.
-
+
Args:
path (str): The path to save the storage state JSON file
-
+
Returns:
dict: The exported storage state
"""
@@ -1826,33 +1961,35 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
tag="WARNING",
)
- async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
+ async def robust_execute_user_script(
+ self, page: Page, js_code: Union[str, List[str]]
+ ) -> Dict[str, Any]:
"""
Executes user-provided JavaScript code with proper error handling and context,
supporting both synchronous and async user code, plus navigations.
-
+
How it works:
1. Wait for load state 'domcontentloaded'
2. If js_code is a string, execute it directly
3. If js_code is a list, execute each element in sequence
- 4. Wait for load state 'networkidle'
- 5. Return results
-
- Args:
+ 4. Wait for load state 'networkidle'
+ 5. Return results
+
+ Args:
page (Page): The Playwright page instance
js_code (Union[str, List[str]]): The JavaScript code to execute
-
+
Returns:
Dict[str, Any]: The results of the execution
"""
try:
- await page.wait_for_load_state('domcontentloaded')
-
+ await page.wait_for_load_state("domcontentloaded")
+
if isinstance(js_code, str):
scripts = [js_code]
else:
scripts = js_code
-
+
results = []
for script in scripts:
try:
@@ -1861,7 +1998,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# then wait for the new page to load before continuing
result = None
try:
- result = await page.evaluate(f"""
+ result = await page.evaluate(
+ f"""
(async () => {{
try {{
{script}
@@ -1870,53 +2008,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return {{ success: false, error: err.toString(), stack: err.stack }};
}}
}})();
- """)
+ """
+ )
except Error as e:
# If it's due to navigation destroying the context, handle gracefully
if "Execution context was destroyed" in str(e):
- self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC")
+ self.logger.info(
+ "Navigation triggered by script, waiting for load state",
+ tag="JS_EXEC",
+ )
try:
- await page.wait_for_load_state('load', timeout=30000)
+ await page.wait_for_load_state("load", timeout=30000)
except Error as nav_err:
self.logger.warning(
message="Navigation wait failed: {error}",
tag="JS_EXEC",
- params={"error": str(nav_err)}
+ params={"error": str(nav_err)},
)
try:
- await page.wait_for_load_state('networkidle', timeout=30000)
+ await page.wait_for_load_state(
+ "networkidle", timeout=30000
+ )
except Error as nav_err:
self.logger.warning(
message="Network idle wait failed: {error}",
tag="JS_EXEC",
- params={"error": str(nav_err)}
+ params={"error": str(nav_err)},
)
# Return partial success, or adapt as you see fit
result = {
"success": True,
- "info": "Navigation triggered, ignoring context destroyed error"
+ "info": "Navigation triggered, ignoring context destroyed error",
}
else:
# It's some other error, log and continue
self.logger.error(
message="Playwright execution error: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
result = {"success": False, "error": str(e)}
-
+
# If we made it this far with no repeated error, do post-load waits
t1 = time.time()
try:
- await page.wait_for_load_state('domcontentloaded', timeout=5000)
- print("DOM content loaded after script execution in", time.time() - t1)
+ await page.wait_for_load_state("domcontentloaded", timeout=5000)
except Error as e:
self.logger.warning(
message="DOM content load timeout: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
-
+
# t1 = time.time()
# try:
# await page.wait_for_load_state('networkidle', timeout=5000)
@@ -1935,46 +2078,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.logger.error(
message="Script chunk failed: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
results.append({"success": False, "error": str(e)})
return {"success": True, "results": results}
-
+
except Exception as e:
self.logger.error(
message="Script execution failed: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
return {"success": False, "error": str(e)}
- async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
+ async def execute_user_script(
+ self, page: Page, js_code: Union[str, List[str]]
+ ) -> Dict[str, Any]:
"""
Executes user-provided JavaScript code with proper error handling and context.
-
+
Args:
page: Playwright page object
js_code: Single JavaScript string or list of JavaScript code strings
-
+
Returns:
Dict containing execution status and results/errors
"""
try:
# Ensure the page is ready for script execution
- await page.wait_for_load_state('domcontentloaded')
-
+ await page.wait_for_load_state("domcontentloaded")
+
# Handle single script or multiple scripts
if isinstance(js_code, str):
scripts = [js_code]
else:
scripts = js_code
-
+
results = []
for script in scripts:
try:
# Execute the script and wait for network idle
- result = await page.evaluate(f"""
+ result = await page.evaluate(
+ f"""
(() => {{
return new Promise((resolve) => {{
try {{
@@ -2007,57 +2153,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}}
}});
}})()
- """)
-
+ """
+ )
+
# Wait for network idle after script execution
t1 = time.time()
- await page.wait_for_load_state('domcontentloaded', timeout=5000)
- print("DOM content loaded after script execution in", time.time() - t1)
+ await page.wait_for_load_state("domcontentloaded", timeout=5000)
+
t1 = time.time()
- await page.wait_for_load_state('networkidle', timeout=5000)
- print("Network idle after script execution in", time.time() - t1)
-
+ await page.wait_for_load_state("networkidle", timeout=5000)
+
results.append(result if result else {"success": True})
-
+
except Error as e:
# Handle Playwright-specific errors
self.logger.error(
message="Playwright execution error: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
results.append({"success": False, "error": str(e)})
-
+
return {"success": True, "results": results}
-
+
except Exception as e:
self.logger.error(
message="Script execution failed: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
return {"success": False, "error": str(e)}
-
+
except Exception as e:
self.logger.error(
message="Script execution failed: {error}",
tag="JS_EXEC",
- params={"error": str(e)}
+ params={"error": str(e)},
)
return {"success": False, "error": str(e)}
async def check_visibility(self, page):
"""
Checks if an element is visible on the page.
-
+
Args:
page: Playwright page object
-
+
Returns:
Boolean indicating visibility
"""
- return await page.evaluate("""
+ return await page.evaluate(
+ """
() => {
const element = document.body;
if (!element) return false;
@@ -2067,31 +2214,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
style.opacity !== '0';
return isVisible;
}
- """)
-
+ """
+ )
+
async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1):
"""
Safely scroll the page with rendering time.
-
+
Args:
page: Playwright page object
x: Horizontal scroll position
y: Vertical scroll position
"""
result = await self.csp_scroll_to(page, x, y)
- if result['success']:
+ if result["success"]:
await page.wait_for_timeout(delay * 1000)
return result
-
+
async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
"""
Performs a CSP-compliant scroll operation and returns the result status.
-
+
Args:
page: Playwright page object
x: Horizontal scroll position
y: Vertical scroll position
-
+
Returns:
Dict containing scroll status and position information
"""
@@ -2125,67 +2273,68 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}}
}}"""
)
-
- if not result['success']:
+
+ if not result["success"]:
self.logger.warning(
message="Scroll operation failed: {error}",
tag="SCROLL",
- params={"error": result.get('error')}
+ params={"error": result.get("error")},
)
-
+
return result
-
+
except Exception as e:
self.logger.error(
message="Failed to execute scroll: {error}",
tag="SCROLL",
- params={"error": str(e)}
+ params={"error": str(e)},
)
- return {
- "success": False,
- "error": str(e)
- }
-
+ return {"success": False, "error": str(e)}
+
async def get_page_dimensions(self, page: Page):
"""
Get the dimensions of the page.
-
+
Args:
page: Playwright page object
-
+
Returns:
Dict containing width and height of the page
"""
- return await page.evaluate("""
+ return await page.evaluate(
+ """
() => {
const {scrollWidth, scrollHeight} = document.documentElement;
return {width: scrollWidth, height: scrollHeight};
}
- """)
-
+ """
+ )
+
async def page_need_scroll(self, page: Page) -> bool:
"""
Determine whether the page need to scroll
-
+
Args:
page: Playwright page object
-
+
Returns:
bool: True if page needs scrolling
"""
try:
- need_scroll = await page.evaluate("""
+ need_scroll = await page.evaluate(
+ """
() => {
const scrollHeight = document.documentElement.scrollHeight;
const viewportHeight = window.innerHeight;
return scrollHeight > viewportHeight;
}
- """)
+ """
+ )
return need_scroll
except Exception as e:
self.logger.warning(
message="Failed to check scroll need: {error}. Defaulting to True for safety.",
tag="SCROLL",
- params={"error": str(e)}
+ params={"error": str(e)},
)
- return True # Default to scrolling if check fails
\ No newline at end of file
+ return True # Default to scrolling if check fails
diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py
index aed9c76b..b0c20f29 100644
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -1,27 +1,30 @@
-import os, sys
+import os
from pathlib import Path
import aiosqlite
import asyncio
-from typing import Optional, Tuple, Dict
+from typing import Optional, Dict
from contextlib import asynccontextmanager
import logging
import json # Added for serialization/deserialization
from .utils import ensure_content_dirs, generate_content_hash
from .models import CrawlResult, MarkdownGenerationResult
-import xxhash
import aiofiles
-from .config import NEED_MIGRATION
from .version_manager import VersionManager
from .async_logger import AsyncLogger
from .utils import get_error_context, create_box_message
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+# Set up logging
+# logging.basicConfig(level=logging.INFO)
+# logger = logging.getLogger(__name__)
+# logger.setLevel(logging.INFO)
+
+base_directory = DB_PATH = os.path.join(
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
+)
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(base_directory, "crawl4ai.db")
+
class AsyncDatabaseManager:
def __init__(self, pool_size: int = 10, max_retries: int = 3):
self.db_path = DB_PATH
@@ -32,28 +35,27 @@ class AsyncDatabaseManager:
self.pool_lock = asyncio.Lock()
self.init_lock = asyncio.Lock()
self.connection_semaphore = asyncio.Semaphore(pool_size)
- self._initialized = False
+ self._initialized = False
self.version_manager = VersionManager()
self.logger = AsyncLogger(
log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
verbose=False,
- tag_width=10
+ tag_width=10,
)
-
-
+
async def initialize(self):
"""Initialize the database and connection pool"""
try:
self.logger.info("Initializing database", tag="INIT")
# Ensure the database file exists
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
-
+
# Check if version update is needed
needs_update = self.version_manager.needs_update()
-
+
# Always ensure base table exists
await self.ainit_db()
-
+
# Verify the table exists
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
async with db.execute(
@@ -62,33 +64,37 @@ class AsyncDatabaseManager:
result = await cursor.fetchone()
if not result:
raise Exception("crawled_data table was not created")
-
+
# If version changed or fresh install, run updates
if needs_update:
self.logger.info("New version detected, running updates", tag="INIT")
await self.update_db_schema()
- from .migrations import run_migration # Import here to avoid circular imports
+ from .migrations import (
+ run_migration,
+ ) # Import here to avoid circular imports
+
await run_migration()
self.version_manager.update_version() # Update stored version after successful migration
- self.logger.success("Version update completed successfully", tag="COMPLETE")
+ self.logger.success(
+ "Version update completed successfully", tag="COMPLETE"
+ )
else:
- self.logger.success("Database initialization completed successfully", tag="COMPLETE")
+ self.logger.success(
+ "Database initialization completed successfully", tag="COMPLETE"
+ )
-
except Exception as e:
self.logger.error(
message="Database initialization error: {error}",
tag="ERROR",
- params={"error": str(e)}
+ params={"error": str(e)},
)
self.logger.info(
- message="Database will be initialized on first use",
- tag="INIT"
+ message="Database will be initialized on first use", tag="INIT"
)
-
+
raise
-
async def cleanup(self):
"""Cleanup connections when shutting down"""
async with self.pool_lock:
@@ -107,6 +113,7 @@ class AsyncDatabaseManager:
self._initialized = True
except Exception as e:
import sys
+
error_context = get_error_context(sys.exc_info())
self.logger.error(
message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
@@ -115,41 +122,52 @@ class AsyncDatabaseManager:
params={
"error": str(e),
"context": error_context["code_context"],
- "traceback": error_context["full_traceback"]
- }
+ "traceback": error_context["full_traceback"],
+ },
)
raise
await self.connection_semaphore.acquire()
task_id = id(asyncio.current_task())
-
+
try:
async with self.pool_lock:
if task_id not in self.connection_pool:
try:
- conn = await aiosqlite.connect(
- self.db_path,
- timeout=30.0
- )
- await conn.execute('PRAGMA journal_mode = WAL')
- await conn.execute('PRAGMA busy_timeout = 5000')
-
+ conn = await aiosqlite.connect(self.db_path, timeout=30.0)
+ await conn.execute("PRAGMA journal_mode = WAL")
+ await conn.execute("PRAGMA busy_timeout = 5000")
+
# Verify database structure
- async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
+ async with conn.execute(
+ "PRAGMA table_info(crawled_data)"
+ ) as cursor:
columns = await cursor.fetchall()
column_names = [col[1] for col in columns]
expected_columns = {
- 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
- 'success', 'media', 'links', 'metadata', 'screenshot',
- 'response_headers', 'downloaded_files'
+ "url",
+ "html",
+ "cleaned_html",
+ "markdown",
+ "extracted_content",
+ "success",
+ "media",
+ "links",
+ "metadata",
+ "screenshot",
+ "response_headers",
+ "downloaded_files",
}
missing_columns = expected_columns - set(column_names)
if missing_columns:
- raise ValueError(f"Database missing columns: {missing_columns}")
-
+ raise ValueError(
+ f"Database missing columns: {missing_columns}"
+ )
+
self.connection_pool[task_id] = conn
except Exception as e:
import sys
+
error_context = get_error_context(sys.exc_info())
error_message = (
f"Unexpected error in db get_connection at line {error_context['line_no']} "
@@ -158,7 +176,7 @@ class AsyncDatabaseManager:
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
- message=create_box_message(error_message, type= "error"),
+ message=create_box_message(error_message, type="error"),
)
raise
@@ -167,6 +185,7 @@ class AsyncDatabaseManager:
except Exception as e:
import sys
+
error_context = get_error_context(sys.exc_info())
error_message = (
f"Unexpected error in db get_connection at line {error_context['line_no']} "
@@ -175,7 +194,7 @@ class AsyncDatabaseManager:
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
- message=create_box_message(error_message, type= "error"),
+ message=create_box_message(error_message, type="error"),
)
raise
finally:
@@ -185,7 +204,6 @@ class AsyncDatabaseManager:
del self.connection_pool[task_id]
self.connection_semaphore.release()
-
async def execute_with_retry(self, operation, *args):
"""Execute database operations with retry logic"""
for attempt in range(self.max_retries):
@@ -200,18 +218,16 @@ class AsyncDatabaseManager:
message="Operation failed after {retries} attempts: {error}",
tag="ERROR",
force_verbose=True,
- params={
- "retries": self.max_retries,
- "error": str(e)
- }
- )
+ params={"retries": self.max_retries, "error": str(e)},
+ )
raise
await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
async def ainit_db(self):
"""Initialize database schema"""
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
- await db.execute('''
+ await db.execute(
+ """
CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY,
html TEXT,
@@ -226,21 +242,27 @@ class AsyncDatabaseManager:
response_headers TEXT DEFAULT "{}",
downloaded_files TEXT DEFAULT "{}" -- New column added
)
- ''')
+ """
+ )
await db.commit()
-
-
async def update_db_schema(self):
"""Update database schema if needed"""
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
cursor = await db.execute("PRAGMA table_info(crawled_data)")
columns = await cursor.fetchall()
column_names = [column[1] for column in columns]
-
+
# List of new columns to add
- new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
-
+ new_columns = [
+ "media",
+ "links",
+ "metadata",
+ "screenshot",
+ "response_headers",
+ "downloaded_files",
+ ]
+
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column, db)
@@ -248,75 +270,95 @@ class AsyncDatabaseManager:
async def aalter_db_add_column(self, new_column: str, db):
"""Add new column to the database"""
- if new_column == 'response_headers':
- await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
+ if new_column == "response_headers":
+ await db.execute(
+ f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"'
+ )
else:
- await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+ await db.execute(
+ f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
+ )
self.logger.info(
message="Added column '{column}' to the database",
tag="INIT",
- params={"column": new_column}
- )
-
+ params={"column": new_column},
+ )
async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
"""Retrieve cached URL data as CrawlResult"""
+
async def _get(db):
async with db.execute(
- 'SELECT * FROM crawled_data WHERE url = ?', (url,)
+ "SELECT * FROM crawled_data WHERE url = ?", (url,)
) as cursor:
row = await cursor.fetchone()
if not row:
return None
-
+
# Get column names
columns = [description[0] for description in cursor.description]
# Create dict from row data
row_dict = dict(zip(columns, row))
-
+
# Load content from files using stored hashes
content_fields = {
- 'html': row_dict['html'],
- 'cleaned_html': row_dict['cleaned_html'],
- 'markdown': row_dict['markdown'],
- 'extracted_content': row_dict['extracted_content'],
- 'screenshot': row_dict['screenshot'],
- 'screenshots': row_dict['screenshot'],
+ "html": row_dict["html"],
+ "cleaned_html": row_dict["cleaned_html"],
+ "markdown": row_dict["markdown"],
+ "extracted_content": row_dict["extracted_content"],
+ "screenshot": row_dict["screenshot"],
+ "screenshots": row_dict["screenshot"],
}
-
+
for field, hash_value in content_fields.items():
if hash_value:
content = await self._load_content(
- hash_value,
- field.split('_')[0] # Get content type from field name
+ hash_value,
+ field.split("_")[0], # Get content type from field name
)
row_dict[field] = content or ""
else:
row_dict[field] = ""
# Parse JSON fields
- json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
+ json_fields = [
+ "media",
+ "links",
+ "metadata",
+ "response_headers",
+ "markdown",
+ ]
for field in json_fields:
try:
- row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
+ row_dict[field] = (
+ json.loads(row_dict[field]) if row_dict[field] else {}
+ )
except json.JSONDecodeError:
- row_dict[field] = {}
+ # Very UGLY, never mention it to me please
+ if field == "markdown" and isinstance(row_dict[field], str):
+ row_dict[field] = row_dict[field]
+ else:
+ row_dict[field] = {}
+
+ if isinstance(row_dict["markdown"], Dict):
+ row_dict["markdown_v2"] = row_dict["markdown"]
+ if row_dict["markdown"].get("raw_markdown"):
+ row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]
- if isinstance(row_dict['markdown'], Dict):
- row_dict['markdown_v2'] = row_dict['markdown']
- if row_dict['markdown'].get('raw_markdown'):
- row_dict['markdown'] = row_dict['markdown']['raw_markdown']
-
# Parse downloaded_files
try:
- row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
+ row_dict["downloaded_files"] = (
+ json.loads(row_dict["downloaded_files"])
+ if row_dict["downloaded_files"]
+ else []
+ )
except json.JSONDecodeError:
- row_dict['downloaded_files'] = []
+ row_dict["downloaded_files"] = []
# Remove any fields not in CrawlResult model
valid_fields = CrawlResult.__annotations__.keys()
filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
-
+
return CrawlResult(**filtered_dict)
try:
@@ -326,7 +368,7 @@ class AsyncDatabaseManager:
message="Error retrieving cached URL: {error}",
tag="ERROR",
force_verbose=True,
- params={"error": str(e)}
+ params={"error": str(e)},
)
return None
@@ -334,37 +376,52 @@ class AsyncDatabaseManager:
"""Cache CrawlResult data"""
# Store content files and get hashes
content_map = {
- 'html': (result.html, 'html'),
- 'cleaned_html': (result.cleaned_html or "", 'cleaned'),
- 'markdown': None,
- 'extracted_content': (result.extracted_content or "", 'extracted'),
- 'screenshot': (result.screenshot or "", 'screenshots')
+ "html": (result.html, "html"),
+ "cleaned_html": (result.cleaned_html or "", "cleaned"),
+ "markdown": None,
+ "extracted_content": (result.extracted_content or "", "extracted"),
+ "screenshot": (result.screenshot or "", "screenshots"),
}
try:
if isinstance(result.markdown, MarkdownGenerationResult):
- content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
- elif hasattr(result, 'markdown_v2'):
- content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
+ content_map["markdown"] = (
+ result.markdown.model_dump_json(),
+ "markdown",
+ )
+ elif hasattr(result, "markdown_v2"):
+ content_map["markdown"] = (
+ result.markdown_v2.model_dump_json(),
+ "markdown",
+ )
elif isinstance(result.markdown, str):
markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
- content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
+ content_map["markdown"] = (
+ markdown_result.model_dump_json(),
+ "markdown",
+ )
else:
- content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
+ content_map["markdown"] = (
+ MarkdownGenerationResult().model_dump_json(),
+ "markdown",
+ )
except Exception as e:
self.logger.warning(
- message=f"Error processing markdown content: {str(e)}",
- tag="WARNING"
+ message=f"Error processing markdown content: {str(e)}", tag="WARNING"
)
# Fallback to empty markdown result
- content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
-
+ content_map["markdown"] = (
+ MarkdownGenerationResult().model_dump_json(),
+ "markdown",
+ )
+
content_hashes = {}
for field, (content, content_type) in content_map.items():
content_hashes[field] = await self._store_content(content, content_type)
async def _cache(db):
- await db.execute('''
+ await db.execute(
+ """
INSERT INTO crawled_data (
url, html, cleaned_html, markdown,
extracted_content, success, media, links, metadata,
@@ -383,20 +440,22 @@ class AsyncDatabaseManager:
screenshot = excluded.screenshot,
response_headers = excluded.response_headers,
downloaded_files = excluded.downloaded_files
- ''', (
- result.url,
- content_hashes['html'],
- content_hashes['cleaned_html'],
- content_hashes['markdown'],
- content_hashes['extracted_content'],
- result.success,
- json.dumps(result.media),
- json.dumps(result.links),
- json.dumps(result.metadata or {}),
- content_hashes['screenshot'],
- json.dumps(result.response_headers or {}),
- json.dumps(result.downloaded_files or [])
- ))
+ """,
+ (
+ result.url,
+ content_hashes["html"],
+ content_hashes["cleaned_html"],
+ content_hashes["markdown"],
+ content_hashes["extracted_content"],
+ result.success,
+ json.dumps(result.media),
+ json.dumps(result.links),
+ json.dumps(result.metadata or {}),
+ content_hashes["screenshot"],
+ json.dumps(result.response_headers or {}),
+ json.dumps(result.downloaded_files or []),
+ ),
+ )
try:
await self.execute_with_retry(_cache)
@@ -405,14 +464,14 @@ class AsyncDatabaseManager:
message="Error caching URL: {error}",
tag="ERROR",
force_verbose=True,
- params={"error": str(e)}
+ params={"error": str(e)},
)
-
async def aget_total_count(self) -> int:
"""Get total number of cached URLs"""
+
async def _count(db):
- async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
+ async with db.execute("SELECT COUNT(*) FROM crawled_data") as cursor:
result = await cursor.fetchone()
return result[0] if result else 0
@@ -423,14 +482,15 @@ class AsyncDatabaseManager:
message="Error getting total count: {error}",
tag="ERROR",
force_verbose=True,
- params={"error": str(e)}
+ params={"error": str(e)},
)
return 0
async def aclear_db(self):
"""Clear all data from the database"""
+
async def _clear(db):
- await db.execute('DELETE FROM crawled_data')
+ await db.execute("DELETE FROM crawled_data")
try:
await self.execute_with_retry(_clear)
@@ -439,13 +499,14 @@ class AsyncDatabaseManager:
message="Error clearing database: {error}",
tag="ERROR",
force_verbose=True,
- params={"error": str(e)}
+ params={"error": str(e)},
)
async def aflush_db(self):
"""Drop the entire table"""
+
async def _flush(db):
- await db.execute('DROP TABLE IF EXISTS crawled_data')
+ await db.execute("DROP TABLE IF EXISTS crawled_data")
try:
await self.execute_with_retry(_flush)
@@ -454,42 +515,44 @@ class AsyncDatabaseManager:
message="Error flushing database: {error}",
tag="ERROR",
force_verbose=True,
- params={"error": str(e)}
+ params={"error": str(e)},
)
-
-
+
async def _store_content(self, content: str, content_type: str) -> str:
"""Store content in filesystem and return hash"""
if not content:
return ""
-
+
content_hash = generate_content_hash(content)
file_path = os.path.join(self.content_paths[content_type], content_hash)
-
+
# Only write if file doesn't exist
if not os.path.exists(file_path):
- async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
+ async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
await f.write(content)
-
+
return content_hash
- async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
+ async def _load_content(
+ self, content_hash: str, content_type: str
+ ) -> Optional[str]:
"""Load content from filesystem by hash"""
if not content_hash:
return None
-
+
file_path = os.path.join(self.content_paths[content_type], content_hash)
try:
- async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
+ async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
return await f.read()
except:
self.logger.error(
message="Failed to load content: {file_path}",
tag="ERROR",
force_verbose=True,
- params={"file_path": file_path}
+ params={"file_path": file_path},
)
return None
+
# Create a singleton instance
async_db_manager = AsyncDatabaseManager()
diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py
new file mode 100644
index 00000000..ed40b8b4
--- /dev/null
+++ b/crawl4ai/async_dispatcher.py
@@ -0,0 +1,647 @@
+from typing import Dict, Optional, List, Tuple
+from .async_configs import CrawlerRunConfig
+from .models import (
+ CrawlResult,
+ CrawlerTaskResult,
+ CrawlStatus,
+ DisplayMode,
+ CrawlStats,
+ DomainState,
+)
+
+from rich.live import Live
+from rich.table import Table
+from rich.console import Console
+from rich import box
+from datetime import datetime, timedelta
+from collections.abc import AsyncGenerator
+import time
+import psutil
+import asyncio
+import uuid
+
+from urllib.parse import urlparse
+import random
+from abc import ABC, abstractmethod
+
+
+
+class RateLimiter:
+ def __init__(
+ self,
+ base_delay: Tuple[float, float] = (1.0, 3.0),
+ max_delay: float = 60.0,
+ max_retries: int = 3,
+ rate_limit_codes: List[int] = None,
+ ):
+ self.base_delay = base_delay
+ self.max_delay = max_delay
+ self.max_retries = max_retries
+ self.rate_limit_codes = rate_limit_codes or [429, 503]
+ self.domains: Dict[str, DomainState] = {}
+
+ def get_domain(self, url: str) -> str:
+ return urlparse(url).netloc
+
+ async def wait_if_needed(self, url: str) -> None:
+ domain = self.get_domain(url)
+ state = self.domains.get(domain)
+
+ if not state:
+ self.domains[domain] = DomainState()
+ state = self.domains[domain]
+
+ now = time.time()
+ if state.last_request_time:
+ wait_time = max(0, state.current_delay - (now - state.last_request_time))
+ if wait_time > 0:
+ await asyncio.sleep(wait_time)
+
+ # Random delay within base range if no current delay
+ if state.current_delay == 0:
+ state.current_delay = random.uniform(*self.base_delay)
+
+ state.last_request_time = time.time()
+
+ def update_delay(self, url: str, status_code: int) -> bool:
+ domain = self.get_domain(url)
+ state = self.domains[domain]
+
+ if status_code in self.rate_limit_codes:
+ state.fail_count += 1
+ if state.fail_count > self.max_retries:
+ return False
+
+ # Exponential backoff with random jitter
+ state.current_delay = min(
+ state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay
+ )
+ else:
+ # Gradually reduce delay on success
+ state.current_delay = max(
+ random.uniform(*self.base_delay), state.current_delay * 0.75
+ )
+ state.fail_count = 0
+
+ return True
+
+
+class CrawlerMonitor:
+ def __init__(
+ self,
+ max_visible_rows: int = 15,
+ display_mode: DisplayMode = DisplayMode.DETAILED,
+ ):
+ self.console = Console()
+ self.max_visible_rows = max_visible_rows
+ self.display_mode = display_mode
+ self.stats: Dict[str, CrawlStats] = {}
+ self.process = psutil.Process()
+ self.start_time = datetime.now()
+ self.live = Live(self._create_table(), refresh_per_second=2)
+
+ def start(self):
+ self.live.start()
+
+ def stop(self):
+ self.live.stop()
+
+ def add_task(self, task_id: str, url: str):
+ self.stats[task_id] = CrawlStats(
+ task_id=task_id, url=url, status=CrawlStatus.QUEUED
+ )
+ self.live.update(self._create_table())
+
+ def update_task(self, task_id: str, **kwargs):
+ if task_id in self.stats:
+ for key, value in kwargs.items():
+ setattr(self.stats[task_id], key, value)
+ self.live.update(self._create_table())
+
+ def _create_aggregated_table(self) -> Table:
+ """Creates a compact table showing only aggregated statistics"""
+ table = Table(
+ box=box.ROUNDED,
+ title="Crawler Status Overview",
+ title_style="bold magenta",
+ header_style="bold blue",
+ show_lines=True,
+ )
+
+ # Calculate statistics
+ total_tasks = len(self.stats)
+ queued = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
+ )
+ in_progress = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
+ )
+ completed = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
+ )
+ failed = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
+ )
+
+ # Memory statistics
+ current_memory = self.process.memory_info().rss / (1024 * 1024)
+ total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
+ peak_memory = max(
+ (stat.peak_memory for stat in self.stats.values()), default=0.0
+ )
+
+ # Duration
+ duration = datetime.now() - self.start_time
+
+ # Create status row
+ table.add_column("Status", style="bold cyan")
+ table.add_column("Count", justify="right")
+ table.add_column("Percentage", justify="right")
+
+ table.add_row("Total Tasks", str(total_tasks), "100%")
+ table.add_row(
+ "[yellow]In Queue[/yellow]",
+ str(queued),
+ f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[blue]In Progress[/blue]",
+ str(in_progress),
+ f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[green]Completed[/green]",
+ str(completed),
+ f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[red]Failed[/red]",
+ str(failed),
+ f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+
+ # Add memory information
+ table.add_section()
+ table.add_row(
+ "[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[yellow]Runtime[/yellow]",
+ str(timedelta(seconds=int(duration.total_seconds()))),
+ "",
+ )
+
+ return table
+
+ def _create_detailed_table(self) -> Table:
+ table = Table(
+ box=box.ROUNDED,
+ title="Crawler Performance Monitor",
+ title_style="bold magenta",
+ header_style="bold blue",
+ )
+
+ # Add columns
+ table.add_column("Task ID", style="cyan", no_wrap=True)
+ table.add_column("URL", style="cyan", no_wrap=True)
+ table.add_column("Status", style="bold")
+ table.add_column("Memory (MB)", justify="right")
+ table.add_column("Peak (MB)", justify="right")
+ table.add_column("Duration", justify="right")
+ table.add_column("Info", style="italic")
+
+ # Add summary row
+ total_memory = sum(stat.memory_usage for stat in self.stats.values())
+ active_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
+ )
+ completed_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
+ )
+ failed_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
+ )
+
+ table.add_row(
+ "[bold yellow]SUMMARY",
+ f"Total: {len(self.stats)}",
+ f"Active: {active_count}",
+ f"{total_memory:.1f}",
+ f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
+ str(
+ timedelta(
+ seconds=int((datetime.now() - self.start_time).total_seconds())
+ )
+ ),
+ f"β{completed_count} β{failed_count}",
+ style="bold",
+ )
+
+ table.add_section()
+
+ # Add rows for each task
+ visible_stats = sorted(
+ self.stats.values(),
+ key=lambda x: (
+ x.status != CrawlStatus.IN_PROGRESS,
+ x.status != CrawlStatus.QUEUED,
+ x.end_time or datetime.max,
+ ),
+ )[: self.max_visible_rows]
+
+ for stat in visible_stats:
+ status_style = {
+ CrawlStatus.QUEUED: "white",
+ CrawlStatus.IN_PROGRESS: "yellow",
+ CrawlStatus.COMPLETED: "green",
+ CrawlStatus.FAILED: "red",
+ }[stat.status]
+
+ table.add_row(
+ stat.task_id[:8], # Show first 8 chars of task ID
+ stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
+ f"[{status_style}]{stat.status.value}[/{status_style}]",
+ f"{stat.memory_usage:.1f}",
+ f"{stat.peak_memory:.1f}",
+ stat.duration,
+ stat.error_message[:40] if stat.error_message else "",
+ )
+
+ return table
+
+ def _create_table(self) -> Table:
+ """Creates the appropriate table based on display mode"""
+ if self.display_mode == DisplayMode.AGGREGATED:
+ return self._create_aggregated_table()
+ return self._create_detailed_table()
+
+
+class BaseDispatcher(ABC):
+ def __init__(
+ self,
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ self.crawler = None
+ self._domain_last_hit: Dict[str, float] = {}
+ self.concurrent_sessions = 0
+ self.rate_limiter = rate_limiter
+ self.monitor = monitor
+
+ @abstractmethod
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ monitor: Optional[CrawlerMonitor] = None,
+ ) -> CrawlerTaskResult:
+ pass
+
+ @abstractmethod
+ async def run_urls(
+ self,
+ urls: List[str],
+ crawler: "AsyncWebCrawler", # noqa: F821
+ config: CrawlerRunConfig,
+ monitor: Optional[CrawlerMonitor] = None,
+ ) -> List[CrawlerTaskResult]:
+ pass
+
+
+class MemoryAdaptiveDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ memory_threshold_percent: float = 90.0,
+ check_interval: float = 1.0,
+ max_session_permit: int = 20,
+ memory_wait_timeout: float = 300.0, # 5 minutes default timeout
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ super().__init__(rate_limiter, monitor)
+ self.memory_threshold_percent = memory_threshold_percent
+ self.check_interval = check_interval
+ self.max_session_permit = max_session_permit
+ self.memory_wait_timeout = memory_wait_timeout
+ self.result_queue = asyncio.Queue() # Queue for storing results
+
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ ) -> CrawlerTaskResult:
+ start_time = datetime.now()
+ error_message = ""
+ memory_usage = peak_memory = 0.0
+
+ try:
+ if self.monitor:
+ self.monitor.update_task(
+ task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
+ )
+ self.concurrent_sessions += 1
+
+ if self.rate_limiter:
+ await self.rate_limiter.wait_if_needed(url)
+
+ process = psutil.Process()
+ start_memory = process.memory_info().rss / (1024 * 1024)
+ result = await self.crawler.arun(url, config=config, session_id=task_id)
+ end_memory = process.memory_info().rss / (1024 * 1024)
+
+ memory_usage = peak_memory = end_memory - start_memory
+
+ if self.rate_limiter and result.status_code:
+ if not self.rate_limiter.update_delay(url, result.status_code):
+ error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ result = CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=datetime.now(),
+ error_message=error_message,
+ )
+ await self.result_queue.put(result)
+ return result
+
+ if not result.success:
+ error_message = result.error_message
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ elif self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
+
+ except Exception as e:
+ error_message = str(e)
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ result = CrawlResult(
+ url=url, html="", metadata={}, success=False, error_message=str(e)
+ )
+
+ finally:
+ end_time = datetime.now()
+ if self.monitor:
+ self.monitor.update_task(
+ task_id,
+ end_time=end_time,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ error_message=error_message,
+ )
+ self.concurrent_sessions -= 1
+
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=end_time,
+ error_message=error_message,
+ )
+
+ async def run_urls(
+ self,
+ urls: List[str],
+ crawler: "AsyncWebCrawler", # noqa: F821
+ config: CrawlerRunConfig,
+ ) -> List[CrawlerTaskResult]:
+ self.crawler = crawler
+
+ if self.monitor:
+ self.monitor.start()
+
+ try:
+ pending_tasks = []
+ active_tasks = []
+ task_queue = []
+
+ for url in urls:
+ task_id = str(uuid.uuid4())
+ if self.monitor:
+ self.monitor.add_task(task_id, url)
+ task_queue.append((url, task_id))
+
+ while task_queue or active_tasks:
+ wait_start_time = time.time()
+ while len(active_tasks) < self.max_session_permit and task_queue:
+ if psutil.virtual_memory().percent >= self.memory_threshold_percent:
+ # Check if we've exceeded the timeout
+ if time.time() - wait_start_time > self.memory_wait_timeout:
+ raise MemoryError(
+ f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
+ )
+ await asyncio.sleep(self.check_interval)
+ continue
+
+ url, task_id = task_queue.pop(0)
+ task = asyncio.create_task(self.crawl_url(url, config, task_id))
+ active_tasks.append(task)
+
+ if not active_tasks:
+ await asyncio.sleep(self.check_interval)
+ continue
+
+ done, pending = await asyncio.wait(
+ active_tasks, return_when=asyncio.FIRST_COMPLETED
+ )
+
+ pending_tasks.extend(done)
+ active_tasks = list(pending)
+
+ return await asyncio.gather(*pending_tasks)
+ finally:
+ if self.monitor:
+ self.monitor.stop()
+
+ async def run_urls_stream(
+ self,
+ urls: List[str],
+ crawler: "AsyncWebCrawler",
+ config: CrawlerRunConfig,
+ ) -> AsyncGenerator[CrawlerTaskResult, None]:
+ self.crawler = crawler
+ if self.monitor:
+ self.monitor.start()
+
+ try:
+ active_tasks = []
+ task_queue = []
+ completed_count = 0
+ total_urls = len(urls)
+
+ # Initialize task queue
+ for url in urls:
+ task_id = str(uuid.uuid4())
+ if self.monitor:
+ self.monitor.add_task(task_id, url)
+ task_queue.append((url, task_id))
+
+ while completed_count < total_urls:
+ # Start new tasks if memory permits
+ while len(active_tasks) < self.max_session_permit and task_queue:
+ if psutil.virtual_memory().percent >= self.memory_threshold_percent:
+ await asyncio.sleep(self.check_interval)
+ continue
+
+ url, task_id = task_queue.pop(0)
+ task = asyncio.create_task(self.crawl_url(url, config, task_id))
+ active_tasks.append(task)
+
+ if not active_tasks and not task_queue:
+ break
+
+ # Wait for any task to complete and yield results
+ if active_tasks:
+ done, pending = await asyncio.wait(
+ active_tasks,
+ timeout=0.1,
+ return_when=asyncio.FIRST_COMPLETED
+ )
+ for completed_task in done:
+ result = await completed_task
+ completed_count += 1
+ yield result
+ active_tasks = list(pending)
+ else:
+ await asyncio.sleep(self.check_interval)
+
+ finally:
+ if self.monitor:
+ self.monitor.stop()
+
+class SemaphoreDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ semaphore_count: int = 5,
+ max_session_permit: int = 20,
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ super().__init__(rate_limiter, monitor)
+ self.semaphore_count = semaphore_count
+ self.max_session_permit = max_session_permit
+
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ semaphore: asyncio.Semaphore = None,
+ ) -> CrawlerTaskResult:
+ start_time = datetime.now()
+ error_message = ""
+ memory_usage = peak_memory = 0.0
+
+ try:
+ if self.monitor:
+ self.monitor.update_task(
+ task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
+ )
+
+ if self.rate_limiter:
+ await self.rate_limiter.wait_if_needed(url)
+
+ async with semaphore:
+ process = psutil.Process()
+ start_memory = process.memory_info().rss / (1024 * 1024)
+ result = await self.crawler.arun(url, config=config, session_id=task_id)
+ end_memory = process.memory_info().rss / (1024 * 1024)
+
+ memory_usage = peak_memory = end_memory - start_memory
+
+ if self.rate_limiter and result.status_code:
+ if not self.rate_limiter.update_delay(url, result.status_code):
+ error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=datetime.now(),
+ error_message=error_message,
+ )
+
+ if not result.success:
+ error_message = result.error_message
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ elif self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
+
+ except Exception as e:
+ error_message = str(e)
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ result = CrawlResult(
+ url=url, html="", metadata={}, success=False, error_message=str(e)
+ )
+
+ finally:
+ end_time = datetime.now()
+ if self.monitor:
+ self.monitor.update_task(
+ task_id,
+ end_time=end_time,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ error_message=error_message,
+ )
+
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=end_time,
+ error_message=error_message,
+ )
+
+ async def run_urls(
+ self,
+ crawler: "AsyncWebCrawler", # noqa: F821
+ urls: List[str],
+ config: CrawlerRunConfig,
+ ) -> List[CrawlerTaskResult]:
+ self.crawler = crawler
+ if self.monitor:
+ self.monitor.start()
+
+ try:
+ semaphore = asyncio.Semaphore(self.semaphore_count)
+ tasks = []
+
+ for url in urls:
+ task_id = str(uuid.uuid4())
+ if self.monitor:
+ self.monitor.add_task(task_id, url)
+ task = asyncio.create_task(
+ self.crawl_url(url, config, task_id, semaphore)
+ )
+ tasks.append(task)
+
+ return await asyncio.gather(*tasks, return_exceptions=True)
+ finally:
+ if self.monitor:
+ self.monitor.stop()
diff --git a/crawl4ai/async_dispatcher_.py b/crawl4ai/async_dispatcher_.py
new file mode 100644
index 00000000..64578bf6
--- /dev/null
+++ b/crawl4ai/async_dispatcher_.py
@@ -0,0 +1,588 @@
+from typing import Dict, Optional, List, Tuple
+from .async_configs import CrawlerRunConfig
+from .models import (
+ CrawlResult,
+ CrawlerTaskResult,
+ CrawlStatus,
+ DisplayMode,
+ CrawlStats,
+ DomainState,
+)
+
+from rich.live import Live
+from rich.table import Table
+from rich.console import Console
+from rich import box
+from datetime import datetime, timedelta
+
+import time
+import psutil
+import asyncio
+import uuid
+
+from urllib.parse import urlparse
+import random
+from abc import ABC, abstractmethod
+
+
+class RateLimiter:
+ def __init__(
+ self,
+ base_delay: Tuple[float, float] = (1.0, 3.0),
+ max_delay: float = 60.0,
+ max_retries: int = 3,
+ rate_limit_codes: List[int] = None,
+ ):
+ self.base_delay = base_delay
+ self.max_delay = max_delay
+ self.max_retries = max_retries
+ self.rate_limit_codes = rate_limit_codes or [429, 503]
+ self.domains: Dict[str, DomainState] = {}
+
+ def get_domain(self, url: str) -> str:
+ return urlparse(url).netloc
+
+ async def wait_if_needed(self, url: str) -> None:
+ domain = self.get_domain(url)
+ state = self.domains.get(domain)
+
+ if not state:
+ self.domains[domain] = DomainState()
+ state = self.domains[domain]
+
+ now = time.time()
+ if state.last_request_time:
+ wait_time = max(0, state.current_delay - (now - state.last_request_time))
+ if wait_time > 0:
+ await asyncio.sleep(wait_time)
+
+ # Random delay within base range if no current delay
+ if state.current_delay == 0:
+ state.current_delay = random.uniform(*self.base_delay)
+
+ state.last_request_time = time.time()
+
+ def update_delay(self, url: str, status_code: int) -> bool:
+ domain = self.get_domain(url)
+ state = self.domains[domain]
+
+ if status_code in self.rate_limit_codes:
+ state.fail_count += 1
+ if state.fail_count > self.max_retries:
+ return False
+
+ # Exponential backoff with random jitter
+ state.current_delay = min(
+ state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay
+ )
+ else:
+ # Gradually reduce delay on success
+ state.current_delay = max(
+ random.uniform(*self.base_delay), state.current_delay * 0.75
+ )
+ state.fail_count = 0
+
+ return True
+
+
+class CrawlerMonitor:
+ def __init__(
+ self,
+ max_visible_rows: int = 15,
+ display_mode: DisplayMode = DisplayMode.DETAILED,
+ ):
+ self.console = Console()
+ self.max_visible_rows = max_visible_rows
+ self.display_mode = display_mode
+ self.stats: Dict[str, CrawlStats] = {}
+ self.process = psutil.Process()
+ self.start_time = datetime.now()
+ self.live = Live(self._create_table(), refresh_per_second=2)
+
+ def start(self):
+ self.live.start()
+
+ def stop(self):
+ self.live.stop()
+
+ def add_task(self, task_id: str, url: str):
+ self.stats[task_id] = CrawlStats(
+ task_id=task_id, url=url, status=CrawlStatus.QUEUED
+ )
+ self.live.update(self._create_table())
+
+ def update_task(self, task_id: str, **kwargs):
+ if task_id in self.stats:
+ for key, value in kwargs.items():
+ setattr(self.stats[task_id], key, value)
+ self.live.update(self._create_table())
+
+ def _create_aggregated_table(self) -> Table:
+ """Creates a compact table showing only aggregated statistics"""
+ table = Table(
+ box=box.ROUNDED,
+ title="Crawler Status Overview",
+ title_style="bold magenta",
+ header_style="bold blue",
+ show_lines=True,
+ )
+
+ # Calculate statistics
+ total_tasks = len(self.stats)
+ queued = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
+ )
+ in_progress = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
+ )
+ completed = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
+ )
+ failed = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
+ )
+
+ # Memory statistics
+ current_memory = self.process.memory_info().rss / (1024 * 1024)
+ total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
+ peak_memory = max(
+ (stat.peak_memory for stat in self.stats.values()), default=0.0
+ )
+
+ # Duration
+ duration = datetime.now() - self.start_time
+
+ # Create status row
+ table.add_column("Status", style="bold cyan")
+ table.add_column("Count", justify="right")
+ table.add_column("Percentage", justify="right")
+
+ table.add_row("Total Tasks", str(total_tasks), "100%")
+ table.add_row(
+ "[yellow]In Queue[/yellow]",
+ str(queued),
+ f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[blue]In Progress[/blue]",
+ str(in_progress),
+ f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[green]Completed[/green]",
+ str(completed),
+ f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+ table.add_row(
+ "[red]Failed[/red]",
+ str(failed),
+ f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
+ )
+
+ # Add memory information
+ table.add_section()
+ table.add_row(
+ "[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
+ )
+ table.add_row(
+ "[yellow]Runtime[/yellow]",
+ str(timedelta(seconds=int(duration.total_seconds()))),
+ "",
+ )
+
+ return table
+
+ def _create_detailed_table(self) -> Table:
+ table = Table(
+ box=box.ROUNDED,
+ title="Crawler Performance Monitor",
+ title_style="bold magenta",
+ header_style="bold blue",
+ )
+
+ # Add columns
+ table.add_column("Task ID", style="cyan", no_wrap=True)
+ table.add_column("URL", style="cyan", no_wrap=True)
+ table.add_column("Status", style="bold")
+ table.add_column("Memory (MB)", justify="right")
+ table.add_column("Peak (MB)", justify="right")
+ table.add_column("Duration", justify="right")
+ table.add_column("Info", style="italic")
+
+ # Add summary row
+ total_memory = sum(stat.memory_usage for stat in self.stats.values())
+ active_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
+ )
+ completed_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
+ )
+ failed_count = sum(
+ 1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
+ )
+
+ table.add_row(
+ "[bold yellow]SUMMARY",
+ f"Total: {len(self.stats)}",
+ f"Active: {active_count}",
+ f"{total_memory:.1f}",
+ f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
+ str(
+ timedelta(
+ seconds=int((datetime.now() - self.start_time).total_seconds())
+ )
+ ),
+ f"✓{completed_count} ✗{failed_count}",
+ style="bold",
+ )
+
+ table.add_section()
+
+ # Add rows for each task
+ visible_stats = sorted(
+ self.stats.values(),
+ key=lambda x: (
+ x.status != CrawlStatus.IN_PROGRESS,
+ x.status != CrawlStatus.QUEUED,
+ x.end_time or datetime.max,
+ ),
+ )[: self.max_visible_rows]
+
+ for stat in visible_stats:
+ status_style = {
+ CrawlStatus.QUEUED: "white",
+ CrawlStatus.IN_PROGRESS: "yellow",
+ CrawlStatus.COMPLETED: "green",
+ CrawlStatus.FAILED: "red",
+ }[stat.status]
+
+ table.add_row(
+ stat.task_id[:8], # Show first 8 chars of task ID
+ stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
+ f"[{status_style}]{stat.status.value}[/{status_style}]",
+ f"{stat.memory_usage:.1f}",
+ f"{stat.peak_memory:.1f}",
+ stat.duration,
+ stat.error_message[:40] if stat.error_message else "",
+ )
+
+ return table
+
+ def _create_table(self) -> Table:
+ """Creates the appropriate table based on display mode"""
+ if self.display_mode == DisplayMode.AGGREGATED:
+ return self._create_aggregated_table()
+ return self._create_detailed_table()
+
+
+class BaseDispatcher(ABC):
+ def __init__(
+ self,
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ self.crawler = None
+ self._domain_last_hit: Dict[str, float] = {}
+ self.concurrent_sessions = 0
+ self.rate_limiter = rate_limiter
+ self.monitor = monitor
+
+ @abstractmethod
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ monitor: Optional[CrawlerMonitor] = None,
+ ) -> CrawlerTaskResult:
+ pass
+
+ @abstractmethod
+ async def run_urls(
+ self,
+ urls: List[str],
+ crawler: "AsyncWebCrawler", # noqa: F821
+ config: CrawlerRunConfig,
+ monitor: Optional[CrawlerMonitor] = None,
+ ) -> List[CrawlerTaskResult]:
+ pass
+
+
+class MemoryAdaptiveDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ memory_threshold_percent: float = 90.0,
+ check_interval: float = 1.0,
+ max_session_permit: int = 20,
+ memory_wait_timeout: float = 300.0, # 5 minutes default timeout
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ super().__init__(rate_limiter, monitor)
+ self.memory_threshold_percent = memory_threshold_percent
+ self.check_interval = check_interval
+ self.max_session_permit = max_session_permit
+ self.memory_wait_timeout = memory_wait_timeout
+
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ ) -> CrawlerTaskResult:
+ start_time = datetime.now()
+ error_message = ""
+ memory_usage = peak_memory = 0.0
+
+ try:
+ if self.monitor:
+ self.monitor.update_task(
+ task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
+ )
+ self.concurrent_sessions += 1
+
+ if self.rate_limiter:
+ await self.rate_limiter.wait_if_needed(url)
+
+ process = psutil.Process()
+ start_memory = process.memory_info().rss / (1024 * 1024)
+ result = await self.crawler.arun(url, config=config, session_id=task_id)
+ end_memory = process.memory_info().rss / (1024 * 1024)
+
+ memory_usage = peak_memory = end_memory - start_memory
+
+ if self.rate_limiter and result.status_code:
+ if not self.rate_limiter.update_delay(url, result.status_code):
+ error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=datetime.now(),
+ error_message=error_message,
+ )
+
+ if not result.success:
+ error_message = result.error_message
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ elif self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
+
+ except Exception as e:
+ error_message = str(e)
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ result = CrawlResult(
+ url=url, html="", metadata={}, success=False, error_message=str(e)
+ )
+
+ finally:
+ end_time = datetime.now()
+ if self.monitor:
+ self.monitor.update_task(
+ task_id,
+ end_time=end_time,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ error_message=error_message,
+ )
+ self.concurrent_sessions -= 1
+
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=end_time,
+ error_message=error_message,
+ )
+
+ async def run_urls(
+ self,
+ urls: List[str],
+ crawler: "AsyncWebCrawler", # noqa: F821
+ config: CrawlerRunConfig,
+ ) -> List[CrawlerTaskResult]:
+ self.crawler = crawler
+
+ if self.monitor:
+ self.monitor.start()
+
+ try:
+ pending_tasks = []
+ active_tasks = []
+ task_queue = []
+
+ for url in urls:
+ task_id = str(uuid.uuid4())
+ if self.monitor:
+ self.monitor.add_task(task_id, url)
+ task_queue.append((url, task_id))
+
+ while task_queue or active_tasks:
+ wait_start_time = time.time()
+ while len(active_tasks) < self.max_session_permit and task_queue:
+ if psutil.virtual_memory().percent >= self.memory_threshold_percent:
+ # Check if we've exceeded the timeout
+ if time.time() - wait_start_time > self.memory_wait_timeout:
+ raise MemoryError(
+ f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
+ )
+ await asyncio.sleep(self.check_interval)
+ continue
+
+ url, task_id = task_queue.pop(0)
+ task = asyncio.create_task(self.crawl_url(url, config, task_id))
+ active_tasks.append(task)
+
+ if not active_tasks:
+ await asyncio.sleep(self.check_interval)
+ continue
+
+ done, pending = await asyncio.wait(
+ active_tasks, return_when=asyncio.FIRST_COMPLETED
+ )
+
+ pending_tasks.extend(done)
+ active_tasks = list(pending)
+
+ return await asyncio.gather(*pending_tasks)
+ finally:
+ if self.monitor:
+ self.monitor.stop()
+
+
+class SemaphoreDispatcher(BaseDispatcher):
+ def __init__(
+ self,
+ semaphore_count: int = 5,
+ max_session_permit: int = 20,
+ rate_limiter: Optional[RateLimiter] = None,
+ monitor: Optional[CrawlerMonitor] = None,
+ ):
+ super().__init__(rate_limiter, monitor)
+ self.semaphore_count = semaphore_count
+ self.max_session_permit = max_session_permit
+
+ async def crawl_url(
+ self,
+ url: str,
+ config: CrawlerRunConfig,
+ task_id: str,
+ semaphore: asyncio.Semaphore = None,
+ ) -> CrawlerTaskResult:
+ start_time = datetime.now()
+ error_message = ""
+ memory_usage = peak_memory = 0.0
+
+ try:
+ if self.monitor:
+ self.monitor.update_task(
+ task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
+ )
+
+ if self.rate_limiter:
+ await self.rate_limiter.wait_if_needed(url)
+
+ async with semaphore:
+ process = psutil.Process()
+ start_memory = process.memory_info().rss / (1024 * 1024)
+ result = await self.crawler.arun(url, config=config, session_id=task_id)
+ end_memory = process.memory_info().rss / (1024 * 1024)
+
+ memory_usage = peak_memory = end_memory - start_memory
+
+ if self.rate_limiter and result.status_code:
+ if not self.rate_limiter.update_delay(url, result.status_code):
+ error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=datetime.now(),
+ error_message=error_message,
+ )
+
+ if not result.success:
+ error_message = result.error_message
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ elif self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
+
+ except Exception as e:
+ error_message = str(e)
+ if self.monitor:
+ self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
+ result = CrawlResult(
+ url=url, html="", metadata={}, success=False, error_message=str(e)
+ )
+
+ finally:
+ end_time = datetime.now()
+ if self.monitor:
+ self.monitor.update_task(
+ task_id,
+ end_time=end_time,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ error_message=error_message,
+ )
+
+ return CrawlerTaskResult(
+ task_id=task_id,
+ url=url,
+ result=result,
+ memory_usage=memory_usage,
+ peak_memory=peak_memory,
+ start_time=start_time,
+ end_time=end_time,
+ error_message=error_message,
+ )
+
+ async def run_urls(
+ self,
+ crawler: "AsyncWebCrawler", # noqa: F821
+ urls: List[str],
+ config: CrawlerRunConfig,
+ ) -> List[CrawlerTaskResult]:
+ self.crawler = crawler
+ if self.monitor:
+ self.monitor.start()
+
+ try:
+ semaphore = asyncio.Semaphore(self.semaphore_count)
+ tasks = []
+
+ for url in urls:
+ task_id = str(uuid.uuid4())
+ if self.monitor:
+ self.monitor.add_task(task_id, url)
+ task = asyncio.create_task(
+ self.crawl_url(url, config, task_id, semaphore)
+ )
+ tasks.append(task)
+
+ return await asyncio.gather(*tasks, return_exceptions=True)
+ finally:
+ if self.monitor:
+ self.monitor.stop()
diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py
index 5d2d54b5..0e049289 100644
--- a/crawl4ai/async_logger.py
+++ b/crawl4ai/async_logger.py
@@ -1,10 +1,10 @@
from enum import Enum
-from typing import Optional, Dict, Any, Union
-from colorama import Fore, Back, Style, init
-import time
+from typing import Optional, Dict, Any
+from colorama import Fore, Style, init
import os
from datetime import datetime
+
class LogLevel(Enum):
DEBUG = 1
INFO = 2
@@ -12,23 +12,24 @@ class LogLevel(Enum):
WARNING = 4
ERROR = 5
+
class AsyncLogger:
"""
Asynchronous logger with support for colored console output and file logging.
Supports templated messages with colored components.
"""
-
+
DEFAULT_ICONS = {
- 'INIT': '→',
- 'READY': '✓',
- 'FETCH': '↓',
- 'SCRAPE': '◆',
- 'EXTRACT': '■',
- 'COMPLETE': '●',
- 'ERROR': '×',
- 'DEBUG': '⋯',
- 'INFO': 'ℹ',
- 'WARNING': '⚠',
+ "INIT": "→",
+ "READY": "✓",
+ "FETCH": "↓",
+ "SCRAPE": "◆",
+ "EXTRACT": "■",
+ "COMPLETE": "●",
+ "ERROR": "×",
+ "DEBUG": "⋯",
+ "INFO": "ℹ",
+ "WARNING": "⚠",
}
DEFAULT_COLORS = {
@@ -46,11 +47,11 @@ class AsyncLogger:
tag_width: int = 10,
icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, str]] = None,
- verbose: bool = True
+ verbose: bool = True,
):
"""
Initialize the logger.
-
+
Args:
log_file: Optional file path for logging
log_level: Minimum log level to display
@@ -66,7 +67,7 @@ class AsyncLogger:
self.icons = icons or self.DEFAULT_ICONS
self.colors = colors or self.DEFAULT_COLORS
self.verbose = verbose
-
+
# Create log file directory if needed
if log_file:
os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
@@ -77,18 +78,20 @@ class AsyncLogger:
def _get_icon(self, tag: str) -> str:
"""Get the icon for a tag, defaulting to info icon if not found."""
- return self.icons.get(tag, self.icons['INFO'])
+ return self.icons.get(tag, self.icons["INFO"])
def _write_to_file(self, message: str):
"""Write a message to the log file if configured."""
if self.log_file:
- timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
- with open(self.log_file, 'a', encoding='utf-8') as f:
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+ with open(self.log_file, "a", encoding="utf-8") as f:
# Strip ANSI color codes for file output
- clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '')
+ clean_message = message.replace(Fore.RESET, "").replace(
+ Style.RESET_ALL, ""
+ )
for color in vars(Fore).values():
if isinstance(color, str):
- clean_message = clean_message.replace(color, '')
+ clean_message = clean_message.replace(color, "")
f.write(f"[{timestamp}] {clean_message}\n")
def _log(
@@ -99,11 +102,11 @@ class AsyncLogger:
params: Optional[Dict[str, Any]] = None,
colors: Optional[Dict[str, str]] = None,
base_color: Optional[str] = None,
- **kwargs
+ **kwargs,
):
"""
Core logging method that handles message formatting and output.
-
+
Args:
level: Log level for this message
message: Message template string
@@ -120,7 +123,7 @@ class AsyncLogger:
try:
# First format the message with raw parameters
formatted_message = message.format(**params)
-
+
# Then apply colors if specified
if colors:
for key, color in colors.items():
@@ -128,12 +131,13 @@ class AsyncLogger:
if key in params:
value_str = str(params[key])
formatted_message = formatted_message.replace(
- value_str,
- f"{color}{value_str}{Style.RESET_ALL}"
+ value_str, f"{color}{value_str}{Style.RESET_ALL}"
)
-
+
except KeyError as e:
- formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
+ formatted_message = (
+ f"LOGGING ERROR: Missing parameter {e} in message template"
+ )
level = LogLevel.ERROR
else:
formatted_message = message
@@ -175,11 +179,11 @@ class AsyncLogger:
success: bool,
timing: float,
tag: str = "FETCH",
- url_length: int = 50
+ url_length: int = 50,
):
"""
Convenience method for logging URL fetch status.
-
+
Args:
url: The URL being processed
success: Whether the operation was successful
@@ -195,24 +199,20 @@ class AsyncLogger:
"url": url,
"url_length": url_length,
"status": success,
- "timing": timing
+ "timing": timing,
},
colors={
"status": Fore.GREEN if success else Fore.RED,
- "timing": Fore.YELLOW
- }
+ "timing": Fore.YELLOW,
+ },
)
def error_status(
- self,
- url: str,
- error: str,
- tag: str = "ERROR",
- url_length: int = 50
+ self, url: str, error: str, tag: str = "ERROR", url_length: int = 50
):
"""
Convenience method for logging error status.
-
+
Args:
url: The URL being processed
error: Error message
@@ -223,9 +223,5 @@ class AsyncLogger:
level=LogLevel.ERROR,
message="{url:.{url_length}}... | Error: {error}",
tag=tag,
- params={
- "url": url,
- "url_length": url_length,
- "error": error
- }
- )
\ No newline at end of file
+ params={"url": url, "url_length": url_length, "error": error},
+ )
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 6ed8ec8f..61cfc18f 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -1,47 +1,60 @@
-import os, sys
+import os
+import sys
import time
import warnings
-from enum import Enum
-from colorama import init, Fore, Back, Style
+from colorama import Fore
from pathlib import Path
-from typing import Optional, List, Union
+from typing import Optional, List
import json
import asyncio
+
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
-from .models import CrawlResult, MarkdownGenerationResult
+from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult
from .async_database import async_db_manager
-from .chunking_strategy import *
-from .content_filter_strategy import *
-from .extraction_strategy import *
-from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
+from .chunking_strategy import * # noqa: F403
+from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
+from .content_filter_strategy import * # noqa: F403
+from .content_filter_strategy import RelevantContentFilter
+from .extraction_strategy import * # noqa: F403
+from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
+from .async_crawler_strategy import (
+ AsyncCrawlerStrategy,
+ AsyncPlaywrightCrawlerStrategy,
+ AsyncCrawlResponse,
+)
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
-from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
-from .content_scraping_strategy import WebScrapingStrategy
+from .markdown_generation_strategy import (
+ DefaultMarkdownGenerator,
+ MarkdownGenerationStrategy,
+)
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
-from .config import (
- MIN_WORD_THRESHOLD,
- IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
- URL_LOG_SHORTEN_LENGTH
-)
+from .async_dispatcher import * # noqa: F403
+from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
+
+from .config import MIN_WORD_THRESHOLD
from .utils import (
sanitize_input_encode,
InvalidCSSSelectorError,
- format_html,
fast_format_html,
- create_box_message
+ create_box_message,
+ get_error_context,
)
-from urllib.parse import urlparse
-import random
+from typing import Union, AsyncGenerator, List, TypeVar
+from collections.abc import AsyncGenerator
+
+CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
+RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+
from .__version__ import __version__ as crawl4ai_version
class AsyncWebCrawler:
"""
Asynchronous web crawler with flexible caching capabilities.
-
+
There are two ways to use the crawler:
1. Using context manager (recommended for simple cases):
@@ -54,23 +67,23 @@ class AsyncWebCrawler:
```python
crawler = AsyncWebCrawler()
await crawler.start()
-
+
# Use the crawler multiple times
result1 = await crawler.arun(url="https://example.com")
result2 = await crawler.arun(url="https://another.com")
-
+
await crawler.close()
```
-
+
Migration Guide:
Old way (deprecated):
crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)
-
+
New way (recommended):
browser_config = BrowserConfig(browser_type="chromium", headless=True)
crawler = AsyncWebCrawler(config=browser_config)
-
-
+
+
Attributes:
browser_config (BrowserConfig): Configuration object for browser settings.
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
@@ -79,7 +92,7 @@ class AsyncWebCrawler:
crawl4ai_folder (str): Directory for storing cache.
base_directory (str): Base directory for storing cache.
ready (bool): Whether the crawler is ready for use.
-
+
Methods:
start(): Start the crawler explicitly without using context manager.
close(): Close the crawler explicitly without using context manager.
@@ -87,21 +100,22 @@ class AsyncWebCrawler:
awarmup(): Perform warmup sequence.
arun_many(): Run the crawler for multiple sources.
aprocess_html(): Process HTML content.
-
+
Typical Usage:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com")
print(result.markdown)
-
+
Using configuration:
browser_config = BrowserConfig(browser_type="chromium", headless=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS
)
result = await crawler.arun(url="https://example.com", config=crawler_config)
print(result.markdown)
"""
+
_domain_last_hit = {}
def __init__(
@@ -125,43 +139,48 @@ class AsyncWebCrawler:
base_directory: Base directory for storing cache
thread_safe: Whether to use thread-safe operations
**kwargs: Additional arguments for backwards compatibility
- """
+ """
# Handle browser configuration
browser_config = config
if browser_config is not None:
- if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]):
+ if any(
+ k in kwargs
+ for k in [
+ "browser_type",
+ "headless",
+ "viewport_width",
+ "viewport_height",
+ ]
+ ):
self.logger.warning(
message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
- tag="WARNING"
+ tag="WARNING",
)
else:
# Create browser config from kwargs for backwards compatibility
browser_config = BrowserConfig.from_kwargs(kwargs)
self.browser_config = browser_config
-
+
# Initialize logger first since other components may need it
self.logger = AsyncLogger(
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
- verbose=self.browser_config.verbose,
- tag_width=10
+ verbose=self.browser_config.verbose,
+ tag_width=10,
)
-
# Initialize crawler strategy
- params = {
- k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger']
- }
+ params = {k: v for k, v in kwargs.items() if k in ["browser_congig", "logger"]}
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
browser_config=browser_config,
logger=self.logger,
- **params # Pass remaining kwargs for backwards compatibility
+ **params, # Pass remaining kwargs for backwards compatibility
)
-
+
# If craweler strategy doesnt have logger, use crawler logger
if not self.crawler_strategy.logger:
self.crawler_strategy.logger = self.logger
-
+
# Handle deprecated cache parameter
if always_by_pass_cache is not None:
if kwargs.get("warning", True):
@@ -170,7 +189,7 @@ class AsyncWebCrawler:
"Use 'always_bypass_cache' instead. "
"Pass warning=False to suppress this warning.",
DeprecationWarning,
- stacklevel=2
+ stacklevel=2,
)
self.always_bypass_cache = always_by_pass_cache
else:
@@ -178,24 +197,24 @@ class AsyncWebCrawler:
# Thread safety setup
self._lock = asyncio.Lock() if thread_safe else None
-
+
# Initialize directories
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
-
+
self.ready = False
async def start(self):
"""
Start the crawler explicitly without using context manager.
This is equivalent to using 'async with' but gives more control over the lifecycle.
-
+
This method will:
1. Initialize the browser and context
2. Perform warmup sequence
3. Return the crawler instance for method chaining
-
+
Returns:
AsyncWebCrawler: The initialized crawler instance
"""
@@ -207,7 +226,7 @@ class AsyncWebCrawler:
"""
Close the crawler explicitly without using context manager.
This should be called when you're done with the crawler if you used start().
-
+
This method will:
1. Clean up browser resources
2. Close any open pages and contexts
@@ -219,11 +238,11 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
-
+
async def awarmup(self):
"""
Initialize the crawler with warm-up sequence.
-
+
This method:
1. Logs initialization info
2. Sets up browser configuration
@@ -238,587 +257,541 @@ class AsyncWebCrawler:
yield
async def arun(
- self,
- url: str,
- config: Optional[CrawlerRunConfig] = None,
- # Legacy parameters maintained for backwards compatibility
- word_count_threshold=MIN_WORD_THRESHOLD,
- extraction_strategy: ExtractionStrategy = None,
- chunking_strategy: ChunkingStrategy = RegexChunking(),
- content_filter: RelevantContentFilter = None,
- cache_mode: Optional[CacheMode] = None,
- # Deprecated cache parameters
- bypass_cache: bool = False,
- disable_cache: bool = False,
- no_cache_read: bool = False,
- no_cache_write: bool = False,
- # Other legacy parameters
- css_selector: str = None,
- screenshot: bool = False,
- pdf: bool = False,
- user_agent: str = None,
- verbose=True,
- **kwargs,
- ) -> CrawlResult:
- """
- Runs the crawler for a single source: URL (web, local file, or raw HTML).
+ self,
+ url: str,
+ config: Optional[CrawlerRunConfig] = None,
+ # Legacy parameters maintained for backwards compatibility
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ content_filter: RelevantContentFilter = None,
+ cache_mode: Optional[CacheMode] = None,
+ # Deprecated cache parameters
+ bypass_cache: bool = False,
+ disable_cache: bool = False,
+ no_cache_read: bool = False,
+ no_cache_write: bool = False,
+ # Other legacy parameters
+ css_selector: str = None,
+ screenshot: bool = False,
+ pdf: bool = False,
+ user_agent: str = None,
+ verbose=True,
+ **kwargs,
+ ) -> CrawlResult:
+ """
+ Runs the crawler for a single source: URL (web, local file, or raw HTML).
- Migration Guide:
- Old way (deprecated):
- result = await crawler.arun(
- url="https://example.com",
- word_count_threshold=200,
- screenshot=True,
- ...
- )
+ Migration Guide:
+ Old way (deprecated):
+ result = await crawler.arun(
+ url="https://example.com",
+ word_count_threshold=200,
+ screenshot=True,
+ ...
+ )
- New way (recommended):
- config = CrawlerRunConfig(
- word_count_threshold=200,
- screenshot=True,
- ...
- )
- result = await crawler.arun(url="https://example.com", crawler_config=config)
+ New way (recommended):
+ config = CrawlerRunConfig(
+ word_count_threshold=200,
+ screenshot=True,
+ ...
+ )
+ result = await crawler.arun(url="https://example.com", crawler_config=config)
- Args:
- url: The URL to crawl (http://, https://, file://, or raw:)
- crawler_config: Configuration object controlling crawl behavior
- [other parameters maintained for backwards compatibility]
+ Args:
+ url: The URL to crawl (http://, https://, file://, or raw:)
+ crawler_config: Configuration object controlling crawl behavior
+ [other parameters maintained for backwards compatibility]
- Returns:
- CrawlResult: The result of crawling and processing
- """
- crawler_config = config
- if not isinstance(url, str) or not url:
- raise ValueError("Invalid URL, make sure the URL is a non-empty string")
-
- async with self._lock or self.nullcontext():
- try:
- # Handle configuration
- if crawler_config is not None:
- # if any(param is not None for param in [
- # word_count_threshold, extraction_strategy, chunking_strategy,
- # content_filter, cache_mode, css_selector, screenshot, pdf
- # ]):
- # self.logger.warning(
- # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
- # tag="WARNING"
- # )
- config = crawler_config
- else:
- # Merge all parameters into a single kwargs dict for config creation
- config_kwargs = {
- "word_count_threshold": word_count_threshold,
- "extraction_strategy": extraction_strategy,
- "chunking_strategy": chunking_strategy,
- "content_filter": content_filter,
- "cache_mode": cache_mode,
- "bypass_cache": bypass_cache,
- "disable_cache": disable_cache,
- "no_cache_read": no_cache_read,
- "no_cache_write": no_cache_write,
- "css_selector": css_selector,
- "screenshot": screenshot,
- "pdf": pdf,
- "verbose": verbose,
- **kwargs
- }
- config = CrawlerRunConfig.from_kwargs(config_kwargs)
+ Returns:
+ CrawlResult: The result of crawling and processing
+ """
+ crawler_config = config
+ if not isinstance(url, str) or not url:
+ raise ValueError("Invalid URL, make sure the URL is a non-empty string")
- # Handle deprecated cache parameters
- if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
- if kwargs.get("warning", True):
- warnings.warn(
- "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
- "Use 'cache_mode' parameter instead.",
- DeprecationWarning,
- stacklevel=2
- )
-
- # Convert legacy parameters if cache_mode not provided
- if config.cache_mode is None:
- config.cache_mode = _legacy_to_cache_mode(
- disable_cache=disable_cache,
- bypass_cache=bypass_cache,
- no_cache_read=no_cache_read,
- no_cache_write=no_cache_write
- )
-
- # Default to ENABLED if no cache mode specified
+ async with self._lock or self.nullcontext():
+ try:
+ # Handle configuration
+ if crawler_config is not None:
+ # if any(param is not None for param in [
+ # word_count_threshold, extraction_strategy, chunking_strategy,
+ # content_filter, cache_mode, css_selector, screenshot, pdf
+ # ]):
+ # self.logger.warning(
+ # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
+ # tag="WARNING"
+ # )
+ config = crawler_config
+ else:
+ # Merge all parameters into a single kwargs dict for config creation
+ config_kwargs = {
+ "word_count_threshold": word_count_threshold,
+ "extraction_strategy": extraction_strategy,
+ "chunking_strategy": chunking_strategy,
+ "content_filter": content_filter,
+ "cache_mode": cache_mode,
+ "bypass_cache": bypass_cache,
+ "disable_cache": disable_cache,
+ "no_cache_read": no_cache_read,
+ "no_cache_write": no_cache_write,
+ "css_selector": css_selector,
+ "screenshot": screenshot,
+ "pdf": pdf,
+ "verbose": verbose,
+ **kwargs,
+ }
+ config = CrawlerRunConfig.from_kwargs(config_kwargs)
+
+ # Handle deprecated cache parameters
+ if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
+ if kwargs.get("warning", True):
+ warnings.warn(
+ "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
+ "Use 'cache_mode' parameter instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+
+ # Convert legacy parameters if cache_mode not provided
if config.cache_mode is None:
- config.cache_mode = CacheMode.ENABLED
-
- # Create cache context
- cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)
-
- # Initialize processing variables
- async_response: AsyncCrawlResponse = None
- cached_result: CrawlResult = None
- screenshot_data = None
- pdf_data = None
- extracted_content = None
- start_time = time.perf_counter()
-
- # Try to get cached result if appropriate
- if cache_context.should_read():
- cached_result = await async_db_manager.aget_cached_url(url)
-
- if cached_result:
- html = sanitize_input_encode(cached_result.html)
- extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
- extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
- # If screenshot is requested but its not in cache, then set cache_result to None
- screenshot_data = cached_result.screenshot
- pdf_data = cached_result.pdf
- if config.screenshot and not screenshot or config.pdf and not pdf:
- cached_result = None
-
- self.logger.url_status(
- url=cache_context.display_url,
- success=bool(html),
- timing=time.perf_counter() - start_time,
- tag="FETCH"
+ config.cache_mode = _legacy_to_cache_mode(
+ disable_cache=disable_cache,
+ bypass_cache=bypass_cache,
+ no_cache_read=no_cache_read,
+ no_cache_write=no_cache_write,
)
- # Fetch fresh content if needed
- if not cached_result or not html:
- t1 = time.perf_counter()
-
- if user_agent:
- self.crawler_strategy.update_user_agent(user_agent)
-
- # Pass config to crawl method
- async_response = await self.crawler_strategy.crawl(
- url,
- config=config # Pass the entire config object
- )
-
- html = sanitize_input_encode(async_response.html)
- screenshot_data = async_response.screenshot
- pdf_data = async_response.pdf_data
-
- t2 = time.perf_counter()
- self.logger.url_status(
- url=cache_context.display_url,
- success=bool(html),
- timing=t2 - t1,
- tag="FETCH"
- )
+ # Default to ENABLED if no cache mode specified
+ if config.cache_mode is None:
+ config.cache_mode = CacheMode.ENABLED
- # Process the HTML content
- crawl_result = await self.aprocess_html(
- url=url,
- html=html,
- extracted_content=extracted_content,
- config=config, # Pass the config object instead of individual parameters
- screenshot=screenshot_data,
- pdf_data=pdf_data,
- verbose=config.verbose,
- is_raw_html = True if url.startswith("raw:") else False,
- **kwargs
- )
+ # Create cache context
+ cache_context = CacheContext(
+ url, config.cache_mode, self.always_bypass_cache
+ )
- crawl_result.status_code = async_response.status_code
- crawl_result.response_headers = async_response.response_headers
- crawl_result.downloaded_files = async_response.downloaded_files
- crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
+ # Initialize processing variables
+ async_response: AsyncCrawlResponse = None
+ cached_result: CrawlResult = None
+ screenshot_data = None
+ pdf_data = None
+ extracted_content = None
+ start_time = time.perf_counter()
- # # Check and set values from async_response to crawl_result
- # try:
- # for key in vars(async_response):
- # if hasattr(crawl_result, key):
- # value = getattr(async_response, key, None)
- # current_value = getattr(crawl_result, key, None)
- # if value is not None and not current_value:
- # try:
- # setattr(crawl_result, key, value)
- # except Exception as e:
- # self.logger.warning(
- # message=f"Failed to set attribute {key}: {str(e)}",
- # tag="WARNING"
- # )
- # except Exception as e:
- # self.logger.warning(
- # message=f"Error copying response attributes: {str(e)}",
- # tag="WARNING"
- # )
+ # Try to get cached result if appropriate
+ if cache_context.should_read():
+ cached_result = await async_db_manager.aget_cached_url(url)
- crawl_result.success = bool(html)
- crawl_result.session_id = getattr(config, 'session_id', None)
-
- self.logger.success(
- message="{url:.50}... | Status: {status} | Total: {timing}",
- tag="COMPLETE",
- params={
- "url": cache_context.display_url,
- "status": crawl_result.success,
- "timing": f"{time.perf_counter() - start_time:.2f}s"
- },
- colors={
- "status": Fore.GREEN if crawl_result.success else Fore.RED,
- "timing": Fore.YELLOW
- }
- )
-
- # Update cache if appropriate
- if cache_context.should_write() and not bool(cached_result):
- await async_db_manager.acache_url(crawl_result)
-
- return crawl_result
-
- else:
- self.logger.success(
- message="{url:.50}... | Status: {status} | Total: {timing}",
- tag="COMPLETE",
- params={
- "url": cache_context.display_url,
- "status": True,
- "timing": f"{time.perf_counter() - start_time:.2f}s"
- },
- colors={
- "status": Fore.GREEN,
- "timing": Fore.YELLOW
- }
- )
-
- cached_result.success = bool(html)
- cached_result.session_id = getattr(config, 'session_id', None)
- return cached_result
-
- except Exception as e:
- error_context = get_error_context(sys.exc_info())
-
- error_message = (
- f"Unexpected error in _crawl_web at line {error_context['line_no']} "
- f"in {error_context['function']} ({error_context['filename']}):\n"
- f"Error: {str(e)}\n\n"
- f"Code context:\n{error_context['code_context']}"
+ if cached_result:
+ html = sanitize_input_encode(cached_result.html)
+ extracted_content = sanitize_input_encode(
+ cached_result.extracted_content or ""
)
- # if not hasattr(e, "msg"):
- # e.msg = str(e)
-
- self.logger.error_status(
+ extracted_content = (
+ None
+ if not extracted_content or extracted_content == "[]"
+ else extracted_content
+ )
+                        # If a screenshot/PDF is requested but it's not in the cache, set cached_result to None
+ screenshot_data = cached_result.screenshot
+ pdf_data = cached_result.pdf
+ if config.screenshot and not screenshot or config.pdf and not pdf:
+ cached_result = None
+
+ self.logger.url_status(
+ url=cache_context.display_url,
+ success=bool(html),
+ timing=time.perf_counter() - start_time,
+ tag="FETCH",
+ )
+
+ # Fetch fresh content if needed
+ if not cached_result or not html:
+ t1 = time.perf_counter()
+
+ if user_agent:
+ self.crawler_strategy.update_user_agent(user_agent)
+
+ # Pass config to crawl method
+ async_response = await self.crawler_strategy.crawl(
+ url,
+ config=config, # Pass the entire config object
+ )
+
+ html = sanitize_input_encode(async_response.html)
+ screenshot_data = async_response.screenshot
+ pdf_data = async_response.pdf_data
+
+ t2 = time.perf_counter()
+ self.logger.url_status(
+ url=cache_context.display_url,
+ success=bool(html),
+ timing=t2 - t1,
+ tag="FETCH",
+ )
+
+ # Process the HTML content
+ crawl_result : CrawlResult = await self.aprocess_html(
url=url,
- error=create_box_message(error_message, type="error"),
- tag="ERROR"
+ html=html,
+ extracted_content=extracted_content,
+ config=config, # Pass the config object instead of individual parameters
+ screenshot=screenshot_data,
+ pdf_data=pdf_data,
+ verbose=config.verbose,
+ is_raw_html=True if url.startswith("raw:") else False,
+ **kwargs,
)
-
- return CrawlResult(
- url=url,
- html="",
- success=False,
- error_message=error_message
+
+ crawl_result.status_code = async_response.status_code
+ crawl_result.redirected_url = async_response.final_url or url
+ crawl_result.response_headers = async_response.response_headers
+ crawl_result.downloaded_files = async_response.downloaded_files
+ crawl_result.ssl_certificate = (
+ async_response.ssl_certificate
+ ) # Add SSL certificate
+
+ # # Check and set values from async_response to crawl_result
+ # try:
+ # for key in vars(async_response):
+ # if hasattr(crawl_result, key):
+ # value = getattr(async_response, key, None)
+ # current_value = getattr(crawl_result, key, None)
+ # if value is not None and not current_value:
+ # try:
+ # setattr(crawl_result, key, value)
+ # except Exception as e:
+ # self.logger.warning(
+ # message=f"Failed to set attribute {key}: {str(e)}",
+ # tag="WARNING"
+ # )
+ # except Exception as e:
+ # self.logger.warning(
+ # message=f"Error copying response attributes: {str(e)}",
+ # tag="WARNING"
+ # )
+
+ crawl_result.success = bool(html)
+ crawl_result.session_id = getattr(config, "session_id", None)
+
+ self.logger.success(
+ message="{url:.50}... | Status: {status} | Total: {timing}",
+ tag="COMPLETE",
+ params={
+ "url": cache_context.display_url,
+ "status": crawl_result.success,
+ "timing": f"{time.perf_counter() - start_time:.2f}s",
+ },
+ colors={
+ "status": Fore.GREEN if crawl_result.success else Fore.RED,
+ "timing": Fore.YELLOW,
+ },
)
+ # Update cache if appropriate
+ if cache_context.should_write() and not bool(cached_result):
+ await async_db_manager.acache_url(crawl_result)
+
+ return crawl_result
+
+ else:
+ self.logger.success(
+ message="{url:.50}... | Status: {status} | Total: {timing}",
+ tag="COMPLETE",
+ params={
+ "url": cache_context.display_url,
+ "status": True,
+ "timing": f"{time.perf_counter() - start_time:.2f}s",
+ },
+ colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+ )
+
+ cached_result.success = bool(html)
+ cached_result.session_id = getattr(config, "session_id", None)
+ cached_result.redirected_url = cached_result.redirected_url or url
+ return cached_result
+
+ except Exception as e:
+ error_context = get_error_context(sys.exc_info())
+
+ error_message = (
+ f"Unexpected error in _crawl_web at line {error_context['line_no']} "
+ f"in {error_context['function']} ({error_context['filename']}):\n"
+ f"Error: {str(e)}\n\n"
+ f"Code context:\n{error_context['code_context']}"
+ )
+ # if not hasattr(e, "msg"):
+ # e.msg = str(e)
+
+ self.logger.error_status(
+ url=url,
+ error=create_box_message(error_message, type="error"),
+ tag="ERROR",
+ )
+
+ return CrawlResult(
+ url=url, html="", success=False, error_message=error_message
+ )
+
async def aprocess_html(
- self,
- url: str,
- html: str,
- extracted_content: str,
- config: CrawlerRunConfig,
- screenshot: str,
- pdf_data: str,
- verbose: bool,
- **kwargs,
- ) -> CrawlResult:
- """
- Process HTML content using the provided configuration.
-
- Args:
- url: The URL being processed
- html: Raw HTML content
- extracted_content: Previously extracted content (if any)
- config: Configuration object controlling processing behavior
- screenshot: Screenshot data (if any)
- pdf_data: PDF data (if any)
- verbose: Whether to enable verbose logging
- **kwargs: Additional parameters for backwards compatibility
-
- Returns:
- CrawlResult: Processed result containing extracted and formatted content
- """
- try:
- _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
- t1 = time.perf_counter()
+ self,
+ url: str,
+ html: str,
+ extracted_content: str,
+ config: CrawlerRunConfig,
+ screenshot: str,
+ pdf_data: str,
+ verbose: bool,
+ **kwargs,
+ ) -> CrawlResult:
+ """
+ Process HTML content using the provided configuration.
- # Initialize scraping strategy
- scrapping_strategy = WebScrapingStrategy(logger=self.logger)
+ Args:
+ url: The URL being processed
+ html: Raw HTML content
+ extracted_content: Previously extracted content (if any)
+ config: Configuration object controlling processing behavior
+ screenshot: Screenshot data (if any)
+ pdf_data: PDF data (if any)
+ verbose: Whether to enable verbose logging
+ **kwargs: Additional parameters for backwards compatibility
- # Process HTML content
- params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
- # add keys from kwargs to params that doesn't exist in params
- params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
-
- result = scrapping_strategy.scrap(
- url,
- html,
- **params,
- # word_count_threshold=config.word_count_threshold,
- # css_selector=config.css_selector,
- # only_text=config.only_text,
- # image_description_min_word_threshold=config.image_description_min_word_threshold,
- # content_filter=config.content_filter,
- # **kwargs
+ Returns:
+ CrawlResult: Processed result containing extracted and formatted content
+ """
+ try:
+ _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
+ t1 = time.perf_counter()
+
+ # Get scraping strategy and ensure it has a logger
+ scraping_strategy = config.scraping_strategy
+ if not scraping_strategy.logger:
+ scraping_strategy.logger = self.logger
+
+ # Process HTML content
+ params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
+            # Add keys from kwargs to params that don't already exist in params
+ params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
+
+ result = scraping_strategy.scrap(url, html, **params)
+
+ if result is None:
+ raise ValueError(
+ f"Process HTML, Failed to extract content from the website: {url}"
)
- if result is None:
- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
+ except InvalidCSSSelectorError as e:
+ raise ValueError(str(e))
+ except Exception as e:
+ raise ValueError(
+ f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
+ )
- except InvalidCSSSelectorError as e:
- raise ValueError(str(e))
- except Exception as e:
- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
-
-
-
- # Extract results
+ # Extract results - handle both dict and ScrapingResult
+ if isinstance(result, dict):
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
- fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
- fit_html = sanitize_input_encode(result.get("fit_html", ""))
- media = result.get("media", [])
- links = result.get("links", [])
+ media = result.get("media", {})
+ links = result.get("links", {})
metadata = result.get("metadata", {})
+ else:
+ cleaned_html = sanitize_input_encode(result.cleaned_html)
+ media = result.media.model_dump()
+ links = result.links.model_dump()
+ metadata = result.metadata
- # Markdown Generation
- markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
-
- # Uncomment if by default we want to use PruningContentFilter
- # if not config.content_filter and not markdown_generator.content_filter:
- # markdown_generator.content_filter = PruningContentFilter()
-
- markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
+ # Markdown Generation
+ markdown_generator: Optional[MarkdownGenerationStrategy] = (
+ config.markdown_generator or DefaultMarkdownGenerator()
+ )
+
+ # Uncomment if by default we want to use PruningContentFilter
+ # if not config.content_filter and not markdown_generator.content_filter:
+ # markdown_generator.content_filter = PruningContentFilter()
+
+ markdown_result: MarkdownGenerationResult = (
+ markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
- markdown_v2 = markdown_result
- markdown = sanitize_input_encode(markdown_result.raw_markdown)
+ )
+ markdown_v2 = markdown_result
+ markdown = sanitize_input_encode(markdown_result.raw_markdown)
- # Log processing completion
- self.logger.info(
- message="Processed {url:.50}... | Time: {timing}ms",
- tag="SCRAPE",
- params={
- "url": _url,
- "timing": int((time.perf_counter() - t1) * 1000)
- }
+ # Log processing completion
+ self.logger.info(
+ message="Processed {url:.50}... | Time: {timing}ms",
+ tag="SCRAPE",
+ params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
+ )
+
+ # Handle content extraction if needed
+ if (
+ not bool(extracted_content)
+ and config.extraction_strategy
+ and not isinstance(config.extraction_strategy, NoExtractionStrategy)
+ ):
+ t1 = time.perf_counter()
+
+ # Choose content based on input_format
+ content_format = config.extraction_strategy.input_format
+ if content_format == "fit_markdown" and not markdown_result.fit_markdown:
+ self.logger.warning(
+ message="Fit markdown requested but not available. Falling back to raw markdown.",
+ tag="EXTRACT",
+ params={"url": _url},
+ )
+ content_format = "markdown"
+
+ content = {
+ "markdown": markdown,
+ "html": html,
+ "fit_markdown": markdown_result.raw_markdown,
+ }.get(content_format, markdown)
+
+ # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
+ chunking = (
+ IdentityChunking()
+ if content_format == "html"
+ else config.chunking_strategy
+ )
+ sections = chunking.chunk(content)
+ extracted_content = config.extraction_strategy.run(url, sections)
+ extracted_content = json.dumps(
+ extracted_content, indent=4, default=str, ensure_ascii=False
)
- # Handle content extraction if needed
- if (extracted_content is None and
- config.extraction_strategy and
- config.chunking_strategy and
- not isinstance(config.extraction_strategy, NoExtractionStrategy)):
-
- t1 = time.perf_counter()
-
- # Choose content based on input_format
- content_format = config.extraction_strategy.input_format
- if content_format == "fit_markdown" and not markdown_result.fit_markdown:
- self.logger.warning(
- message="Fit markdown requested but not available. Falling back to raw markdown.",
- tag="EXTRACT",
- params={"url": _url}
- )
- content_format = "markdown"
+ # Log extraction completion
+ self.logger.info(
+ message="Completed for {url:.50}... | Time: {timing}s",
+ tag="EXTRACT",
+ params={"url": _url, "timing": time.perf_counter() - t1},
+ )
- content = {
- "markdown": markdown,
- "html": html,
- "fit_markdown": markdown_result.raw_markdown
- }.get(content_format, markdown)
-
- # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
- chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
- sections = chunking.chunk(content)
- extracted_content = config.extraction_strategy.run(url, sections)
- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
+ # Handle screenshot and PDF data
+ screenshot_data = None if not screenshot else screenshot
+ pdf_data = None if not pdf_data else pdf_data
- # Log extraction completion
- self.logger.info(
- message="Completed for {url:.50}... | Time: {timing}s",
- tag="EXTRACT",
- params={
- "url": _url,
- "timing": time.perf_counter() - t1
- }
- )
+ # Apply HTML formatting if requested
+ if config.prettiify:
+ cleaned_html = fast_format_html(cleaned_html)
- # Handle screenshot and PDF data
- screenshot_data = None if not screenshot else screenshot
- pdf_data = None if not pdf_data else pdf_data
-
- # Apply HTML formatting if requested
- if config.prettiify:
- cleaned_html = fast_format_html(cleaned_html)
-
- # Return complete crawl result
- return CrawlResult(
- url=url,
- html=html,
- cleaned_html=cleaned_html,
- markdown_v2=markdown_v2,
- markdown=markdown,
- fit_markdown=fit_markdown,
- fit_html=fit_html,
- media=media,
- links=links,
- metadata=metadata,
- screenshot=screenshot_data,
- pdf=pdf_data,
- extracted_content=extracted_content,
- success=True,
- error_message="",
- )
+ # Return complete crawl result
+ return CrawlResult(
+ url=url,
+ html=html,
+ cleaned_html=cleaned_html,
+ markdown_v2=markdown_v2,
+ markdown=markdown,
+ fit_markdown=markdown_result.fit_markdown,
+ fit_html=markdown_result.fit_html,
+ media=media,
+ links=links,
+ metadata=metadata,
+ screenshot=screenshot_data,
+ pdf=pdf_data,
+ extracted_content=extracted_content,
+ success=True,
+ error_message="",
+ )
async def arun_many(
- self,
- urls: List[str],
- config: Optional[CrawlerRunConfig] = None,
- # Legacy parameters maintained for backwards compatibility
- word_count_threshold=MIN_WORD_THRESHOLD,
- extraction_strategy: ExtractionStrategy = None,
- chunking_strategy: ChunkingStrategy = RegexChunking(),
- content_filter: RelevantContentFilter = None,
- cache_mode: Optional[CacheMode] = None,
- bypass_cache: bool = False,
- css_selector: str = None,
- screenshot: bool = False,
- pdf: bool = False,
- user_agent: str = None,
- verbose=True,
- **kwargs,
- ) -> List[CrawlResult]:
- """
- Runs the crawler for multiple URLs concurrently.
+ self,
+ urls: List[str],
+ config: Optional[CrawlerRunConfig] = None,
+ dispatcher: Optional[BaseDispatcher] = None,
+ # Legacy parameters maintained for backwards compatibility
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ content_filter: RelevantContentFilter = None,
+ cache_mode: Optional[CacheMode] = None,
+ bypass_cache: bool = False,
+ css_selector: str = None,
+ screenshot: bool = False,
+ pdf: bool = False,
+ user_agent: str = None,
+ verbose=True,
+ **kwargs
+ ) -> RunManyReturn:
+ """
+ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
- Migration Guide:
- Old way (deprecated):
- results = await crawler.arun_many(
- urls,
- word_count_threshold=200,
- screenshot=True,
- ...
- )
-
- New way (recommended):
- config = CrawlerRunConfig(
- word_count_threshold=200,
- screenshot=True,
- ...
- )
- results = await crawler.arun_many(urls, crawler_config=config)
+ Args:
+ urls: List of URLs to crawl
+ config: Configuration object controlling crawl behavior for all URLs
+ dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
+ [other parameters maintained for backwards compatibility]
- Args:
- urls: List of URLs to crawl
- crawler_config: Configuration object controlling crawl behavior for all URLs
- [other parameters maintained for backwards compatibility]
-
- Returns:
- List[CrawlResult]: Results for each URL
- """
- crawler_config = config
- # Handle configuration
- if crawler_config is not None:
- if any(param is not None for param in [
- word_count_threshold, extraction_strategy, chunking_strategy,
- content_filter, cache_mode, css_selector, screenshot, pdf
- ]):
- self.logger.warning(
- message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
- tag="WARNING"
- )
- config = crawler_config
- else:
- # Merge all parameters into a single kwargs dict for config creation
- config_kwargs = {
- "word_count_threshold": word_count_threshold,
- "extraction_strategy": extraction_strategy,
- "chunking_strategy": chunking_strategy,
- "content_filter": content_filter,
- "cache_mode": cache_mode,
- "bypass_cache": bypass_cache,
- "css_selector": css_selector,
- "screenshot": screenshot,
- "pdf": pdf,
- "verbose": verbose,
- **kwargs
- }
- config = CrawlerRunConfig.from_kwargs(config_kwargs)
+ Returns:
+ Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
+ Either a list of all results or an async generator yielding results
- if bypass_cache:
- if kwargs.get("warning", True):
- warnings.warn(
- "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
- "Use 'cache_mode=CacheMode.BYPASS' instead. "
- "Pass warning=False to suppress this warning.",
- DeprecationWarning,
- stacklevel=2
- )
- if config.cache_mode is None:
- config.cache_mode = CacheMode.BYPASS
+ Examples:
- semaphore_count = config.semaphore_count or 5
- semaphore = asyncio.Semaphore(semaphore_count)
+ # Batch processing (default)
+ results = await crawler.arun_many(
+ urls=["https://example1.com", "https://example2.com"],
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+ )
+ for result in results:
+ print(f"Processed {result.url}: {len(result.markdown)} chars")
- async def crawl_with_semaphore(url):
- # Handle rate limiting per domain
- domain = urlparse(url).netloc
- current_time = time.time()
-
- self.logger.debug(
- message="Started task for {url:.50}...",
- tag="PARALLEL",
- params={"url": url}
- )
-
- # Get delay settings from config
- mean_delay = config.mean_delay
- max_range = config.max_range
-
- # Apply rate limiting
- if domain in self._domain_last_hit:
- time_since_last = current_time - self._domain_last_hit[domain]
- if time_since_last < mean_delay:
- delay = mean_delay + random.uniform(0, max_range)
- await asyncio.sleep(delay)
-
- self._domain_last_hit[domain] = current_time
-
- async with semaphore:
- return await self.arun(
- url,
- crawler_config=config, # Pass the entire config object
- user_agent=user_agent # Maintain user_agent override capability
- )
-
- # Log start of concurrent crawling
- self.logger.info(
- message="Starting concurrent crawling for {count} URLs...",
- tag="INIT",
- params={"count": len(urls)}
+ # Streaming results
+ async for result in await crawler.arun_many(
+ urls=["https://example1.com", "https://example2.com"],
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True),
+ ):
+ print(f"Processed {result.url}: {len(result.markdown)} chars")
+ """
+ if config is None:
+ config = CrawlerRunConfig(
+ word_count_threshold=word_count_threshold,
+ extraction_strategy=extraction_strategy,
+ chunking_strategy=chunking_strategy,
+ content_filter=content_filter,
+ cache_mode=cache_mode,
+ bypass_cache=bypass_cache,
+ css_selector=css_selector,
+ screenshot=screenshot,
+ pdf=pdf,
+ verbose=verbose,
+ **kwargs,
)
- # Execute concurrent crawls
- start_time = time.perf_counter()
- tasks = [crawl_with_semaphore(url) for url in urls]
- results = await asyncio.gather(*tasks, return_exceptions=True)
- end_time = time.perf_counter()
-
- # Log completion
- self.logger.success(
- message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
- tag="COMPLETE",
- params={
- "count": len(urls),
- "timing": f"{end_time - start_time:.2f}s"
- },
- colors={
- "timing": Fore.YELLOW
- }
+ if dispatcher is None:
+ dispatcher = MemoryAdaptiveDispatcher(
+ rate_limiter=RateLimiter(
+ base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3
+ ),
)
- return [result if not isinstance(result, Exception) else str(result) for result in results]
+ transform_result = lambda task_result: (
+ setattr(task_result.result, 'dispatch_result',
+ DispatchResult(
+ task_id=task_result.task_id,
+ memory_usage=task_result.memory_usage,
+ peak_memory=task_result.peak_memory,
+ start_time=task_result.start_time,
+ end_time=task_result.end_time,
+ error_message=task_result.error_message,
+ )
+ ) or task_result.result
+ )
+
+ stream = config.stream
+
+ if stream:
+ async def result_transformer():
+ async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
+ yield transform_result(task_result)
+ return result_transformer()
+ else:
+ _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
+ return [transform_result(res) for res in _results]
async def aclear_cache(self):
"""Clear the cache database."""
diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py
index 588edd62..75914b5b 100644
--- a/crawl4ai/cache_context.py
+++ b/crawl4ai/cache_context.py
@@ -4,7 +4,7 @@ from enum import Enum
class CacheMode(Enum):
"""
Defines the caching behavior for web crawling operations.
-
+
Modes:
- ENABLED: Normal caching behavior (read and write)
- DISABLED: No caching at all
@@ -12,6 +12,7 @@ class CacheMode(Enum):
- WRITE_ONLY: Only write to cache, don't read
- BYPASS: Bypass cache for this operation
"""
+
ENABLED = "enabled"
DISABLED = "disabled"
READ_ONLY = "read_only"
@@ -22,10 +23,10 @@ class CacheMode(Enum):
class CacheContext:
"""
Encapsulates cache-related decisions and URL handling.
-
+
This class centralizes all cache-related logic and URL type checking,
making the caching behavior more predictable and maintainable.
-
+
Attributes:
url (str): The URL being processed.
cache_mode (CacheMode): The cache mode for the current operation.
@@ -36,10 +37,11 @@ class CacheContext:
is_raw_html (bool): True if the URL is raw HTML, False otherwise.
_url_display (str): The display name for the URL (web, local file, or raw HTML).
"""
+
def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
"""
Initializes the CacheContext with the provided URL and cache mode.
-
+
Args:
url (str): The URL being processed.
cache_mode (CacheMode): The cache mode for the current operation.
@@ -48,42 +50,42 @@ class CacheContext:
self.url = url
self.cache_mode = cache_mode
self.always_bypass = always_bypass
- self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
- self.is_web_url = url.startswith(('http://', 'https://'))
+ self.is_cacheable = url.startswith(("http://", "https://", "file://"))
+ self.is_web_url = url.startswith(("http://", "https://"))
self.is_local_file = url.startswith("file://")
self.is_raw_html = url.startswith("raw:")
self._url_display = url if not self.is_raw_html else "Raw HTML"
-
+
def should_read(self) -> bool:
"""
Determines if cache should be read based on context.
-
+
How it works:
1. If always_bypass is True or is_cacheable is False, return False.
2. If cache_mode is ENABLED or READ_ONLY, return True.
-
+
Returns:
bool: True if cache should be read, False otherwise.
"""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
-
+
def should_write(self) -> bool:
"""
Determines if cache should be written based on context.
-
+
How it works:
1. If always_bypass is True or is_cacheable is False, return False.
2. If cache_mode is ENABLED or WRITE_ONLY, return True.
-
+
Returns:
bool: True if cache should be written, False otherwise.
"""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
-
+
@property
def display_url(self) -> str:
"""Returns the URL in display format."""
@@ -94,11 +96,11 @@ def _legacy_to_cache_mode(
disable_cache: bool = False,
bypass_cache: bool = False,
no_cache_read: bool = False,
- no_cache_write: bool = False
+ no_cache_write: bool = False,
) -> CacheMode:
"""
Converts legacy cache parameters to the new CacheMode enum.
-
+
This is an internal function to help transition from the old boolean flags
to the new CacheMode system.
"""
diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py
index 7b8c08ad..ca188d1d 100644
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -3,49 +3,53 @@ import re
from collections import Counter
import string
from .model_loader import load_nltk_punkt
-from .utils import *
+
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
"""
Abstract base class for chunking strategies.
"""
-
+
@abstractmethod
def chunk(self, text: str) -> list:
"""
Abstract method to chunk the given text.
-
+
Args:
text (str): The text to chunk.
-
+
Returns:
list: A list of chunks.
"""
pass
+
# Create an identity chunking strategy f(x) = [x]
class IdentityChunking(ChunkingStrategy):
"""
Chunking strategy that returns the input text as a single chunk.
"""
+
def chunk(self, text: str) -> list:
return [text]
+
# Regex-based chunking
class RegexChunking(ChunkingStrategy):
"""
Chunking strategy that splits text based on regular expression patterns.
"""
+
def __init__(self, patterns=None, **kwargs):
"""
Initialize the RegexChunking object.
-
+
Args:
patterns (list): A list of regular expression patterns to split text.
"""
if patterns is None:
- patterns = [r'\n\n'] # Default split pattern
+ patterns = [r"\n\n"] # Default split pattern
self.patterns = patterns
def chunk(self, text: str) -> list:
@@ -56,18 +60,19 @@ class RegexChunking(ChunkingStrategy):
new_paragraphs.extend(re.split(pattern, paragraph))
paragraphs = new_paragraphs
return paragraphs
-
-# NLP-based sentence chunking
+
+
+# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
"""
Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
- """
+ """
+
def __init__(self, **kwargs):
"""
Initialize the NlpSentenceChunking object.
"""
load_nltk_punkt()
-
def chunk(self, text: str) -> list:
# Improved regex for sentence splitting
@@ -75,31 +80,34 @@ class NlpSentenceChunking(ChunkingStrategy):
# r'(? list:
# Tokenize and remove stopwords and punctuation
import nltk as nl
+
tokens = nl.toknize.word_tokenize(text)
- tokens = [token.lower() for token in tokens if token not in nl.corpus.stopwords.words('english') and token not in string.punctuation]
+ tokens = [
+ token.lower()
+ for token in tokens
+ if token not in nl.corpus.stopwords.words("english")
+ and token not in string.punctuation
+ ]
# Calculate frequency distribution
freq_dist = Counter(tokens)
@@ -123,23 +137,27 @@ class TopicSegmentationChunking(ChunkingStrategy):
# Segment the text into topics
segments = self.chunk(text)
# Extract keywords for each topic segment
- segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
+ segments_with_topics = [
+ (segment, self.extract_keywords(segment)) for segment in segments
+ ]
return segments_with_topics
-
+
+
# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
"""
Chunking strategy that splits text into fixed-length word chunks.
-
+
How it works:
1. Split the text into words
2. Create chunks of fixed length
3. Return the list of chunks
"""
+
def __init__(self, chunk_size=100, **kwargs):
"""
Initialize the fixed-length word chunking strategy with the given chunk size.
-
+
Args:
chunk_size (int): The size of each chunk in words.
"""
@@ -147,23 +165,28 @@ class FixedLengthWordChunking(ChunkingStrategy):
def chunk(self, text: str) -> list:
words = text.split()
- return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
-
+ return [
+ " ".join(words[i : i + self.chunk_size])
+ for i in range(0, len(words), self.chunk_size)
+ ]
+
+
# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
"""
Chunking strategy that splits text into overlapping word chunks.
-
+
How it works:
1. Split the text into words
2. Create chunks of fixed length
3. Return the list of chunks
"""
+
def __init__(self, window_size=100, step=50, **kwargs):
"""
Initialize the sliding window chunking strategy with the given window size and
step size.
-
+
Args:
window_size (int): The size of the sliding window in words.
step (int): The step size for sliding the window in words.
@@ -174,35 +197,37 @@ class SlidingWindowChunking(ChunkingStrategy):
def chunk(self, text: str) -> list:
words = text.split()
chunks = []
-
+
if len(words) <= self.window_size:
return [text]
-
+
for i in range(0, len(words) - self.window_size + 1, self.step):
- chunk = ' '.join(words[i:i + self.window_size])
+ chunk = " ".join(words[i : i + self.window_size])
chunks.append(chunk)
-
+
# Handle the last chunk if it doesn't align perfectly
if i + self.window_size < len(words):
- chunks.append(' '.join(words[-self.window_size:]))
-
+ chunks.append(" ".join(words[-self.window_size :]))
+
return chunks
-
+
+
class OverlappingWindowChunking(ChunkingStrategy):
"""
Chunking strategy that splits text into overlapping word chunks.
-
+
How it works:
1. Split the text into words using whitespace
2. Create chunks of fixed length equal to the window size
3. Slide the window by the overlap size
4. Return the list of chunks
"""
+
def __init__(self, window_size=1000, overlap=100, **kwargs):
"""
Initialize the overlapping window chunking strategy with the given window size and
overlap size.
-
+
Args:
window_size (int): The size of the window in words.
overlap (int): The size of the overlap between consecutive chunks in words.
@@ -213,19 +238,19 @@ class OverlappingWindowChunking(ChunkingStrategy):
def chunk(self, text: str) -> list:
words = text.split()
chunks = []
-
+
if len(words) <= self.window_size:
return [text]
-
+
start = 0
while start < len(words):
end = start + self.window_size
- chunk = ' '.join(words[start:end])
+ chunk = " ".join(words[start:end])
chunks.append(chunk)
-
+
if end >= len(words):
break
-
+
start = end - self.overlap
-
- return chunks
\ No newline at end of file
+
+ return chunks
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 4a01c1c2..b2d2199e 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -8,15 +8,22 @@ from .async_logger import AsyncLogger
logger = AsyncLogger(verbose=True)
docs_manager = DocsManager(logger)
+
def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
"""Print formatted table with headers and rows"""
widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
- border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+'
-
+ border = "+" + "+".join("-" * (w + 2 * padding) for w in widths) + "+"
+
def format_row(row):
- return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
- for cell, w in zip(row, widths)) + '|'
-
+ return (
+ "|"
+ + "|".join(
+ f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
+ for cell, w in zip(row, widths)
+ )
+ + "|"
+ )
+
click.echo(border)
click.echo(format_row(headers))
click.echo(border)
@@ -24,19 +31,24 @@ def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
click.echo(format_row(row))
click.echo(border)
+
@click.group()
def cli():
"""Crawl4AI Command Line Interface"""
pass
+
@cli.group()
def docs():
"""Documentation operations"""
pass
+
@docs.command()
-@click.argument('sections', nargs=-1)
-@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended')
+@click.argument("sections", nargs=-1)
+@click.option(
+ "--mode", type=click.Choice(["extended", "condensed"]), default="extended"
+)
def combine(sections: tuple, mode: str):
"""Combine documentation sections"""
try:
@@ -46,16 +58,17 @@ def combine(sections: tuple, mode: str):
logger.error(str(e), tag="ERROR")
sys.exit(1)
+
@docs.command()
-@click.argument('query')
-@click.option('--top-k', '-k', default=5)
-@click.option('--build-index', is_flag=True, help='Build index if missing')
+@click.argument("query")
+@click.option("--top-k", "-k", default=5)
+@click.option("--build-index", is_flag=True, help="Build index if missing")
def search(query: str, top_k: int, build_index: bool):
"""Search documentation"""
try:
result = docs_manager.search(query, top_k)
if result == "No search index available. Call build_search_index() first.":
- if build_index or click.confirm('No search index found. Build it now?'):
+ if build_index or click.confirm("No search index found. Build it now?"):
asyncio.run(docs_manager.llm_text.generate_index_files())
result = docs_manager.search(query, top_k)
click.echo(result)
@@ -63,6 +76,7 @@ def search(query: str, top_k: int, build_index: bool):
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
+
@docs.command()
def update():
"""Update docs from GitHub"""
@@ -73,22 +87,25 @@ def update():
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
+
@docs.command()
-@click.option('--force-facts', is_flag=True, help='Force regenerate fact files')
-@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache')
+@click.option("--force-facts", is_flag=True, help="Force regenerate fact files")
+@click.option("--clear-cache", is_flag=True, help="Clear BM25 cache")
def index(force_facts: bool, clear_cache: bool):
"""Build or rebuild search indexes"""
try:
asyncio.run(docs_manager.ensure_docs_exist())
- asyncio.run(docs_manager.llm_text.generate_index_files(
- force_generate_facts=force_facts,
- clear_bm25_cache=clear_cache
- ))
+ asyncio.run(
+ docs_manager.llm_text.generate_index_files(
+ force_generate_facts=force_facts, clear_bm25_cache=clear_cache
+ )
+ )
click.echo("Search indexes built successfully")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
+
# Add docs list command
@docs.command()
def list():
@@ -96,10 +113,11 @@ def list():
try:
sections = docs_manager.list()
print_table(["Sections"], [[section] for section in sections])
-
+
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
-if __name__ == '__main__':
- cli()
\ No newline at end of file
+
+if __name__ == "__main__":
+ cli()
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index c2be7638..3e26514a 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -8,7 +8,7 @@ DEFAULT_PROVIDER = "openai/gpt-4o-mini"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
- "ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
+ "ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
@@ -22,27 +22,49 @@ PROVIDER_MODELS = {
}
# Chunk token threshold
-CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
+CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens
OVERLAP_RATE = 0.1
WORD_TOKEN_RATE = 1.3
-# Threshold for the minimum number of word in a HTML tag to be considered
+# Threshold for the minimum number of word in a HTML tag to be considered
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
-IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
-ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
+IMPORTANT_ATTRS = ["src", "href", "alt", "title", "width", "height"]
+ONLY_TEXT_ELIGIBLE_TAGS = [
+ "b",
+ "i",
+ "u",
+ "span",
+ "del",
+ "ins",
+ "sub",
+ "sup",
+ "strong",
+ "em",
+ "code",
+ "kbd",
+ "var",
+ "s",
+ "q",
+ "abbr",
+ "cite",
+ "dfn",
+ "time",
+ "small",
+ "mark",
+]
SOCIAL_MEDIA_DOMAINS = [
- 'facebook.com',
- 'twitter.com',
- 'x.com',
- 'linkedin.com',
- 'instagram.com',
- 'pinterest.com',
- 'tiktok.com',
- 'snapchat.com',
- 'reddit.com',
- ]
+ "facebook.com",
+ "twitter.com",
+ "x.com",
+ "linkedin.com",
+ "instagram.com",
+ "pinterest.com",
+ "tiktok.com",
+ "snapchat.com",
+ "reddit.com",
+]
# Threshold for the Image extraction - Range is 1 to 6
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
@@ -60,5 +82,5 @@ NEED_MIGRATION = True
URL_LOG_SHORTEN_LENGTH = 30
SHOW_DEPRECATION_WARNINGS = True
SCREENSHOT_HEIGHT_TRESHOLD = 10000
-PAGE_TIMEOUT=60000
-DOWNLOAD_PAGE_TIMEOUT=60000
\ No newline at end of file
+PAGE_TIMEOUT = 60000
+DOWNLOAD_PAGE_TIMEOUT = 60000
diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index ce433118..75702ec5 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -1,59 +1,110 @@
import re
+import time
from bs4 import BeautifulSoup, Tag
-from typing import List, Tuple, Dict
+from typing import List, Tuple, Dict, Optional
from rank_bm25 import BM25Okapi
-from time import perf_counter
from collections import deque
-from bs4 import BeautifulSoup, NavigableString, Tag, Comment
-from .utils import clean_tokens
+from bs4 import NavigableString, Comment
+from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
+from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
+from .models import TokenUsage
+from .prompts import PROMPT_FILTER_CONTENT
+import os
+import json
+import hashlib
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .async_logger import AsyncLogger, LogLevel
+from colorama import Fore, Style, init
+
class RelevantContentFilter(ABC):
"""Abstract base class for content filtering strategies"""
+
def __init__(self, user_query: str = None):
self.user_query = user_query
self.included_tags = {
# Primary structure
- 'article', 'main', 'section', 'div',
+ "article",
+ "main",
+ "section",
+ "div",
# List structures
- 'ul', 'ol', 'li', 'dl', 'dt', 'dd',
+ "ul",
+ "ol",
+ "li",
+ "dl",
+ "dt",
+ "dd",
# Text content
- 'p', 'span', 'blockquote', 'pre', 'code',
+ "p",
+ "span",
+ "blockquote",
+ "pre",
+ "code",
# Headers
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
# Tables
- 'table', 'thead', 'tbody', 'tr', 'td', 'th',
+ "table",
+ "thead",
+ "tbody",
+ "tr",
+ "td",
+ "th",
# Other semantic elements
- 'figure', 'figcaption', 'details', 'summary',
+ "figure",
+ "figcaption",
+ "details",
+ "summary",
# Text formatting
- 'em', 'strong', 'b', 'i', 'mark', 'small',
+ "em",
+ "strong",
+ "b",
+ "i",
+ "mark",
+ "small",
# Rich content
- 'time', 'address', 'cite', 'q'
+ "time",
+ "address",
+ "cite",
+ "q",
}
self.excluded_tags = {
- 'nav', 'footer', 'header', 'aside', 'script',
- 'style', 'form', 'iframe', 'noscript'
+ "nav",
+ "footer",
+ "header",
+ "aside",
+ "script",
+ "style",
+ "form",
+ "iframe",
+ "noscript",
}
- self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
+ self.header_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
self.negative_patterns = re.compile(
- r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share',
- re.I
+ r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
)
self.min_word_count = 2
-
+
@abstractmethod
def filter_content(self, html: str) -> List[str]:
"""Abstract method to be implemented by specific filtering strategies"""
pass
-
+
def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
"""Common method to extract page metadata with fallbacks"""
if self.user_query:
return self.user_query
query_parts = []
-
+
# Title
try:
title = soup.title.string
@@ -62,109 +113,145 @@ class RelevantContentFilter(ABC):
except Exception:
pass
- if soup.find('h1'):
- query_parts.append(soup.find('h1').get_text())
-
+ if soup.find("h1"):
+ query_parts.append(soup.find("h1").get_text())
+
# Meta tags
temp = ""
- for meta_name in ['keywords', 'description']:
- meta = soup.find('meta', attrs={'name': meta_name})
- if meta and meta.get('content'):
- query_parts.append(meta['content'])
- temp += meta['content']
-
+ for meta_name in ["keywords", "description"]:
+ meta = soup.find("meta", attrs={"name": meta_name})
+ if meta and meta.get("content"):
+ query_parts.append(meta["content"])
+ temp += meta["content"]
+
# If still empty, grab first significant paragraph
if not temp:
# Find the first tag P thatits text contains more than 50 characters
- for p in body.find_all('p'):
+ for p in body.find_all("p"):
if len(p.get_text()) > 150:
query_parts.append(p.get_text()[:150])
- break
-
- return ' '.join(filter(None, query_parts))
+ break
- def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
+ return " ".join(filter(None, query_parts))
+
+ def extract_text_chunks(
+ self, body: Tag, min_word_threshold: int = None
+ ) -> List[Tuple[str, str]]:
"""
Extracts text chunks from a BeautifulSoup body element while preserving order.
Returns list of tuples (text, tag_name) for classification.
-
+
Args:
body: BeautifulSoup Tag object representing the body element
-
+
Returns:
List of (text, tag_name) tuples
"""
# Tags to ignore - inline elements that shouldn't break text flow
INLINE_TAGS = {
- 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
- 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
- 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
- 'textarea', 'time', 'tt', 'var'
+ "a",
+ "abbr",
+ "acronym",
+ "b",
+ "bdo",
+ "big",
+ "br",
+ "button",
+ "cite",
+ "code",
+ "dfn",
+ "em",
+ "i",
+ "img",
+ "input",
+ "kbd",
+ "label",
+ "map",
+ "object",
+ "q",
+ "samp",
+ "script",
+ "select",
+ "small",
+ "span",
+ "strong",
+ "sub",
+ "sup",
+ "textarea",
+ "time",
+ "tt",
+ "var",
}
-
+
# Tags that typically contain meaningful headers
- HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'}
-
+ HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "header"}
+
chunks = []
current_text = []
chunk_index = 0
-
+
def should_break_chunk(tag: Tag) -> bool:
"""Determine if a tag should cause a break in the current text chunk"""
- return (
- tag.name not in INLINE_TAGS
- and not (tag.name == 'p' and len(current_text) == 0)
+ return tag.name not in INLINE_TAGS and not (
+ tag.name == "p" and len(current_text) == 0
)
-
+
# Use deque for efficient push/pop operations
stack = deque([(body, False)])
-
+
while stack:
element, visited = stack.pop()
-
+
if visited:
# End of block element - flush accumulated text
if current_text and should_break_chunk(element):
- text = ' '.join(''.join(current_text).split())
+ text = " ".join("".join(current_text).split())
if text:
- tag_type = 'header' if element.name in HEADER_TAGS else 'content'
+ tag_type = (
+ "header" if element.name in HEADER_TAGS else "content"
+ )
chunks.append((chunk_index, text, tag_type, element))
chunk_index += 1
current_text = []
continue
-
+
if isinstance(element, NavigableString):
if str(element).strip():
current_text.append(str(element).strip())
continue
-
+
# Pre-allocate children to avoid multiple list operations
children = list(element.children)
if not children:
continue
-
+
# Mark block for revisit after processing children
stack.append((element, True))
-
+
# Add children in reverse order for correct processing
for child in reversed(children):
if isinstance(child, (Tag, NavigableString)):
stack.append((child, False))
-
+
# Handle any remaining text
if current_text:
- text = ' '.join(''.join(current_text).split())
+ text = " ".join("".join(current_text).split())
if text:
- chunks.append((chunk_index, text, 'content', body))
-
- if min_word_threshold:
- chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
-
- return chunks
+ chunks.append((chunk_index, text, "content", body))
- def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
+ if min_word_threshold:
+ chunks = [
+ chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold
+ ]
+
+ return chunks
+
+ def _deprecated_extract_text_chunks(
+ self, soup: BeautifulSoup
+ ) -> List[Tuple[int, str, Tag]]:
"""Common method for extracting text chunks"""
_text_cache = {}
+
def fast_text(element: Tag) -> str:
elem_id = id(element)
if elem_id in _text_cache:
@@ -175,13 +262,13 @@ class RelevantContentFilter(ABC):
text = content.strip()
if text:
texts.append(text)
- result = ' '.join(texts)
+ result = " ".join(texts)
_text_cache[elem_id] = result
return result
-
+
candidates = []
index = 0
-
+
def dfs(element):
nonlocal index
if isinstance(element, Tag):
@@ -189,7 +276,7 @@ class RelevantContentFilter(ABC):
if not self.is_excluded(element):
text = fast_text(element)
word_count = len(text.split())
-
+
# Headers pass through with adjusted minimum
if element.name in self.header_tags:
if word_count >= 3: # Minimal sanity check for headers
@@ -199,7 +286,7 @@ class RelevantContentFilter(ABC):
elif word_count >= self.min_word_count:
candidates.append((index, text, element))
index += 1
-
+
for child in element.children:
dfs(child)
@@ -210,59 +297,66 @@ class RelevantContentFilter(ABC):
"""Common method for exclusion logic"""
if tag.name in self.excluded_tags:
return True
- class_id = ' '.join(filter(None, [
- ' '.join(tag.get('class', [])),
- tag.get('id', '')
- ]))
+ class_id = " ".join(
+ filter(None, [" ".join(tag.get("class", [])), tag.get("id", "")])
+ )
return bool(self.negative_patterns.search(class_id))
def clean_element(self, tag: Tag) -> str:
"""Common method for cleaning HTML elements with minimal overhead"""
if not tag or not isinstance(tag, Tag):
return ""
-
- unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'}
- unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'}
-
+
+ unwanted_tags = {"script", "style", "aside", "form", "iframe", "noscript"}
+ unwanted_attrs = {
+ "style",
+ "onclick",
+ "onmouseover",
+ "align",
+ "bgcolor",
+ "class",
+ "id",
+ }
+
# Use string builder pattern for better performance
builder = []
-
+
def render_tag(elem):
if not isinstance(elem, Tag):
if isinstance(elem, str):
builder.append(elem.strip())
return
-
+
if elem.name in unwanted_tags:
return
-
+
# Start tag
- builder.append(f'<{elem.name}')
-
+ builder.append(f"<{elem.name}")
+
# Add cleaned attributes
attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs}
for key, value in attrs.items():
builder.append(f' {key}="{value}"')
-
- builder.append('>')
-
+
+ builder.append(">")
+
# Process children
for child in elem.children:
render_tag(child)
-
+
# Close tag
-            builder.append(f'</{elem.name}>')
-
+            builder.append(f"</{elem.name}>")
+
try:
render_tag(tag)
- return ''.join(builder)
+ return "".join(builder)
except Exception:
return str(tag) # Fallback to original if anything fails
class BM25ContentFilter(RelevantContentFilter):
"""
Content filtering using BM25 algorithm with priority tag handling.
-
+
How it works:
1. Extracts page metadata with fallbacks.
2. Extracts text chunks from the body element.
@@ -271,22 +365,28 @@ class BM25ContentFilter(RelevantContentFilter):
5. Filters out chunks below the threshold.
6. Sorts chunks by score in descending order.
7. Returns the top N chunks.
-
+
Attributes:
user_query (str): User query for filtering (optional).
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
language (str): Language for stemming (default: 'english').
-
+
Methods:
filter_content(self, html: str, min_word_threshold: int = None)
"""
- def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
+
+ def __init__(
+ self,
+ user_query: str = None,
+ bm25_threshold: float = 1.0,
+ language: str = "english",
+ ):
"""
Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.
-
+
Note:
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
-
+
Args:
user_query (str): User query for filtering (optional).
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
@@ -295,52 +395,52 @@ class BM25ContentFilter(RelevantContentFilter):
super().__init__(user_query=user_query)
self.bm25_threshold = bm25_threshold
self.priority_tags = {
- 'h1': 5.0,
- 'h2': 4.0,
- 'h3': 3.0,
- 'title': 4.0,
- 'strong': 2.0,
- 'b': 1.5,
- 'em': 1.5,
- 'blockquote': 2.0,
- 'code': 2.0,
- 'pre': 1.5,
- 'th': 1.5, # Table headers
+ "h1": 5.0,
+ "h2": 4.0,
+ "h3": 3.0,
+ "title": 4.0,
+ "strong": 2.0,
+ "b": 1.5,
+ "em": 1.5,
+ "blockquote": 2.0,
+ "code": 2.0,
+ "pre": 1.5,
+ "th": 1.5, # Table headers
}
self.stemmer = stemmer(language)
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
"""
Implements content filtering using BM25 algorithm with priority tag handling.
-
+
Note:
This method implements the filtering logic for the BM25ContentFilter class.
It takes HTML content as input and returns a list of filtered text chunks.
-
+
Args:
html (str): HTML content to be filtered.
min_word_threshold (int): Minimum word threshold for filtering (optional).
-
+
Returns:
List[str]: List of filtered text chunks.
"""
if not html or not isinstance(html, str):
return []
- soup = BeautifulSoup(html, 'lxml')
-
+ soup = BeautifulSoup(html, "lxml")
+
# Check if body is present
if not soup.body:
# Wrap in body tag if missing
- soup = BeautifulSoup(f'
{html}', 'lxml')
- body = soup.find('body')
-
+ soup = BeautifulSoup(f"{html}", "lxml")
+ body = soup.find("body")
+
query = self.extract_page_query(soup, body)
-
+
if not query:
return []
# return [self.clean_element(soup)]
-
+
candidates = self.extract_text_chunks(body, min_word_threshold)
if not candidates:
@@ -349,16 +449,20 @@ class BM25ContentFilter(RelevantContentFilter):
# Tokenize corpus
# tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
# tokenized_query = query.lower().split()
-
- # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()]
- # for _, chunk, _, _ in candidates]
- # tokenized_query = [ps.stem(word) for word in query.lower().split()]
-
- tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()]
- for _, chunk, _, _ in candidates]
- tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]
- # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
+ # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()]
+ # for _, chunk, _, _ in candidates]
+ # tokenized_query = [ps.stem(word) for word in query.lower().split()]
+
+ tokenized_corpus = [
+ [self.stemmer.stemWord(word) for word in chunk.lower().split()]
+ for _, chunk, _, _ in candidates
+ ]
+ tokenized_query = [
+ self.stemmer.stemWord(word) for word in query.lower().split()
+ ]
+
+ # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())]
# for _, chunk, _, _ in candidates]
# tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())]
@@ -378,7 +482,8 @@ class BM25ContentFilter(RelevantContentFilter):
# Filter candidates by threshold
selected_candidates = [
- (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
+ (index, chunk, tag)
+ for adjusted_score, index, chunk, tag in adjusted_candidates
if adjusted_score >= self.bm25_threshold
]
@@ -393,7 +498,7 @@ class BM25ContentFilter(RelevantContentFilter):
class PruningContentFilter(RelevantContentFilter):
"""
Content filtering using pruning algorithm with dynamic threshold.
-
+
How it works:
1. Extracts page metadata with fallbacks.
2. Extracts text chunks from the body element.
@@ -407,18 +512,24 @@ class PruningContentFilter(RelevantContentFilter):
min_word_threshold (int): Minimum word threshold for filtering (optional).
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
threshold (float): Fixed threshold value (default: 0.48).
-
+
Methods:
filter_content(self, html: str, min_word_threshold: int = None):
"""
- def __init__(self, user_query: str = None, min_word_threshold: int = None,
- threshold_type: str = 'fixed', threshold: float = 0.48):
+
+ def __init__(
+ self,
+ user_query: str = None,
+ min_word_threshold: int = None,
+ threshold_type: str = "fixed",
+ threshold: float = 0.48,
+ ):
"""
Initializes the PruningContentFilter class, if not provided, falls back to page metadata.
-
+
Note:
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
-
+
Args:
user_query (str): User query for filtering (optional).
min_word_threshold (int): Minimum word threshold for filtering (optional).
@@ -429,92 +540,92 @@ class PruningContentFilter(RelevantContentFilter):
self.min_word_threshold = min_word_threshold
self.threshold_type = threshold_type
self.threshold = threshold
-
+
# Add tag importance for dynamic threshold
self.tag_importance = {
- 'article': 1.5,
- 'main': 1.4,
- 'section': 1.3,
- 'p': 1.2,
- 'h1': 1.4,
- 'h2': 1.3,
- 'h3': 1.2,
- 'div': 0.7,
- 'span': 0.6
+ "article": 1.5,
+ "main": 1.4,
+ "section": 1.3,
+ "p": 1.2,
+ "h1": 1.4,
+ "h2": 1.3,
+ "h3": 1.2,
+ "div": 0.7,
+ "span": 0.6,
}
-
+
# Metric configuration
self.metric_config = {
- 'text_density': True,
- 'link_density': True,
- 'tag_weight': True,
- 'class_id_weight': True,
- 'text_length': True,
+ "text_density": True,
+ "link_density": True,
+ "tag_weight": True,
+ "class_id_weight": True,
+ "text_length": True,
}
-
+
self.metric_weights = {
- 'text_density': 0.4,
- 'link_density': 0.2,
- 'tag_weight': 0.2,
- 'class_id_weight': 0.1,
- 'text_length': 0.1,
+ "text_density": 0.4,
+ "link_density": 0.2,
+ "tag_weight": 0.2,
+ "class_id_weight": 0.1,
+ "text_length": 0.1,
}
-
+
self.tag_weights = {
- 'div': 0.5,
- 'p': 1.0,
- 'article': 1.5,
- 'section': 1.0,
- 'span': 0.3,
- 'li': 0.5,
- 'ul': 0.5,
- 'ol': 0.5,
- 'h1': 1.2,
- 'h2': 1.1,
- 'h3': 1.0,
- 'h4': 0.9,
- 'h5': 0.8,
- 'h6': 0.7,
+ "div": 0.5,
+ "p": 1.0,
+ "article": 1.5,
+ "section": 1.0,
+ "span": 0.3,
+ "li": 0.5,
+ "ul": 0.5,
+ "ol": 0.5,
+ "h1": 1.2,
+ "h2": 1.1,
+ "h3": 1.0,
+ "h4": 0.9,
+ "h5": 0.8,
+ "h6": 0.7,
}
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
"""
Implements content filtering using pruning algorithm with dynamic threshold.
-
+
Note:
This method implements the filtering logic for the PruningContentFilter class.
It takes HTML content as input and returns a list of filtered text chunks.
-
+
Args:
html (str): HTML content to be filtered.
min_word_threshold (int): Minimum word threshold for filtering (optional).
-
+
Returns:
List[str]: List of filtered text chunks.
"""
if not html or not isinstance(html, str):
return []
-
- soup = BeautifulSoup(html, 'lxml')
+
+ soup = BeautifulSoup(html, "lxml")
if not soup.body:
- soup = BeautifulSoup(f'{html}', 'lxml')
-
+ soup = BeautifulSoup(f"{html}", "lxml")
+
# Remove comments and unwanted tags
self._remove_comments(soup)
self._remove_unwanted_tags(soup)
-
+
# Prune tree starting from body
- body = soup.find('body')
+ body = soup.find("body")
self._prune_tree(body)
-
+
# Extract remaining content as list of HTML strings
content_blocks = []
for element in body.children:
- if isinstance(element, str) or not hasattr(element, 'name'):
+ if isinstance(element, str) or not hasattr(element, "name"):
continue
if len(element.get_text(strip=True)) > 0:
content_blocks.append(str(element))
-
+
return content_blocks
def _remove_comments(self, soup):
@@ -531,34 +642,38 @@ class PruningContentFilter(RelevantContentFilter):
def _prune_tree(self, node):
"""
Prunes the tree starting from the given node.
-
+
Args:
node (Tag): The node from which the pruning starts.
"""
- if not node or not hasattr(node, 'name') or node.name is None:
+ if not node or not hasattr(node, "name") or node.name is None:
return
text_len = len(node.get_text(strip=True))
- tag_len = len(node.encode_contents().decode('utf-8'))
- link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s)
+ tag_len = len(node.encode_contents().decode("utf-8"))
+ link_text_len = sum(
+ len(s.strip())
+ for s in (a.string for a in node.find_all("a", recursive=False))
+ if s
+ )
metrics = {
- 'node': node,
- 'tag_name': node.name,
- 'text_len': text_len,
- 'tag_len': tag_len,
- 'link_text_len': link_text_len
+ "node": node,
+ "tag_name": node.name,
+ "text_len": text_len,
+ "tag_len": tag_len,
+ "link_text_len": link_text_len,
}
score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
- if self.threshold_type == 'fixed':
+ if self.threshold_type == "fixed":
should_remove = score < self.threshold
else: # dynamic
tag_importance = self.tag_importance.get(node.name, 0.7)
text_ratio = text_len / tag_len if tag_len > 0 else 0
link_ratio = link_text_len / text_len if text_len > 0 else 1
-
+
threshold = self.threshold # base threshold
if tag_importance > 1:
threshold *= 0.8
@@ -566,13 +681,13 @@ class PruningContentFilter(RelevantContentFilter):
threshold *= 0.9
if link_ratio > 0.6:
threshold *= 1.2
-
+
should_remove = score < threshold
if should_remove:
node.decompose()
else:
- children = [child for child in node.children if hasattr(child, 'name')]
+ children = [child for child in node.children if hasattr(child, "name")]
for child in children:
self._prune_tree(child)
@@ -580,48 +695,305 @@ class PruningContentFilter(RelevantContentFilter):
"""Computes the composite score"""
if self.min_word_threshold:
# Get raw text from metrics node - avoid extra processing
- text = metrics['node'].get_text(strip=True)
- word_count = text.count(' ') + 1
+ text = metrics["node"].get_text(strip=True)
+ word_count = text.count(" ") + 1
if word_count < self.min_word_threshold:
return -1.0 # Guaranteed removal
score = 0.0
total_weight = 0.0
- if self.metric_config['text_density']:
+ if self.metric_config["text_density"]:
density = text_len / tag_len if tag_len > 0 else 0
- score += self.metric_weights['text_density'] * density
- total_weight += self.metric_weights['text_density']
+ score += self.metric_weights["text_density"] * density
+ total_weight += self.metric_weights["text_density"]
- if self.metric_config['link_density']:
+ if self.metric_config["link_density"]:
density = 1 - (link_text_len / text_len if text_len > 0 else 0)
- score += self.metric_weights['link_density'] * density
- total_weight += self.metric_weights['link_density']
+ score += self.metric_weights["link_density"] * density
+ total_weight += self.metric_weights["link_density"]
- if self.metric_config['tag_weight']:
- tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
- score += self.metric_weights['tag_weight'] * tag_score
- total_weight += self.metric_weights['tag_weight']
+ if self.metric_config["tag_weight"]:
+ tag_score = self.tag_weights.get(metrics["tag_name"], 0.5)
+ score += self.metric_weights["tag_weight"] * tag_score
+ total_weight += self.metric_weights["tag_weight"]
- if self.metric_config['class_id_weight']:
- class_score = self._compute_class_id_weight(metrics['node'])
- score += self.metric_weights['class_id_weight'] * max(0, class_score)
- total_weight += self.metric_weights['class_id_weight']
+ if self.metric_config["class_id_weight"]:
+ class_score = self._compute_class_id_weight(metrics["node"])
+ score += self.metric_weights["class_id_weight"] * max(0, class_score)
+ total_weight += self.metric_weights["class_id_weight"]
- if self.metric_config['text_length']:
- score += self.metric_weights['text_length'] * math.log(text_len + 1)
- total_weight += self.metric_weights['text_length']
+ if self.metric_config["text_length"]:
+ score += self.metric_weights["text_length"] * math.log(text_len + 1)
+ total_weight += self.metric_weights["text_length"]
return score / total_weight if total_weight > 0 else 0
def _compute_class_id_weight(self, node):
"""Computes the class ID weight"""
class_id_score = 0
- if 'class' in node.attrs:
- classes = ' '.join(node['class'])
+ if "class" in node.attrs:
+ classes = " ".join(node["class"])
if self.negative_patterns.match(classes):
class_id_score -= 0.5
- if 'id' in node.attrs:
- element_id = node['id']
+ if "id" in node.attrs:
+ element_id = node["id"]
if self.negative_patterns.match(element_id):
class_id_score -= 0.5
- return class_id_score
\ No newline at end of file
+ return class_id_score
+
+class LLMContentFilter(RelevantContentFilter):
+ """Content filtering using LLMs to generate relevant markdown."""
+
+ def __init__(
+ self,
+ provider: str = DEFAULT_PROVIDER,
+ api_token: Optional[str] = None,
+ instruction: str = None,
+ chunk_token_threshold: int = int(1e9),
+ overlap_rate: float = OVERLAP_RATE,
+ word_token_rate: float = WORD_TOKEN_RATE,
+ base_url: Optional[str] = None,
+ api_base: Optional[str] = None,
+ extra_args: Dict = None,
+ verbose: bool = False,
+ logger: Optional[AsyncLogger] = None,
+ ):
+ super().__init__(None)
+ self.provider = provider
+ self.api_token = (
+ api_token
+ or PROVIDER_MODELS.get(provider, "no-token")
+ or os.getenv("OPENAI_API_KEY")
+ )
+ self.instruction = instruction
+ self.chunk_token_threshold = chunk_token_threshold
+ self.overlap_rate = overlap_rate
+ self.word_token_rate = word_token_rate
+ self.base_url = base_url
+ self.api_base = api_base or base_url
+ self.extra_args = extra_args or {}
+ self.verbose = verbose
+
+ # Setup logger with custom styling for LLM operations
+ if logger:
+ self.logger = logger
+ elif verbose:
+ self.logger = AsyncLogger(
+ verbose=True,
+ icons={
+ **AsyncLogger.DEFAULT_ICONS,
+ "LLM": "⭐", # Star for LLM operations
+ "CHUNK": "◆", # Diamond for chunks
+ "CACHE": "⚡", # Lightning for cache operations
+ },
+ colors={
+ **AsyncLogger.DEFAULT_COLORS,
+ LogLevel.INFO: Fore.MAGENTA + Style.DIM, # Dimmed purple for LLM ops
+ }
+ )
+ else:
+ self.logger = None
+
+ self.usages = []
+ self.total_usage = TokenUsage()
+
+ def _get_cache_key(self, html: str, instruction: str) -> str:
+ """Generate a unique cache key based on HTML and instruction"""
+ content = f"{html}{instruction}"
+ return hashlib.md5(content.encode()).hexdigest()
+
+ def _merge_chunks(self, text: str) -> List[str]:
+ """Split text into chunks with overlap"""
+ # Calculate tokens and sections
+ total_tokens = len(text.split()) * self.word_token_rate
+ num_sections = max(1, math.floor(total_tokens / self.chunk_token_threshold))
+ adjusted_chunk_threshold = total_tokens / num_sections
+
+ # Split into words
+ words = text.split()
+ chunks = []
+ current_chunk = []
+ current_token_count = 0
+
+ for word in words:
+ word_tokens = len(word) * self.word_token_rate
+ if current_token_count + word_tokens <= adjusted_chunk_threshold:
+ current_chunk.append(word)
+ current_token_count += word_tokens
+ else:
+ # Add overlap if not the last chunk
+ if chunks and self.overlap_rate > 0:
+ overlap_size = int(len(current_chunk) * self.overlap_rate)
+ current_chunk.extend(current_chunk[-overlap_size:])
+
+ chunks.append(" ".join(current_chunk))
+ current_chunk = [word]
+ current_token_count = word_tokens
+
+ if current_chunk:
+ chunks.append(" ".join(current_chunk))
+
+ return chunks
+
+ def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
+ if not html or not isinstance(html, str):
+ return []
+
+ if self.logger:
+ self.logger.info(
+ "Starting LLM content filtering process",
+ tag="LLM",
+ params={"provider": self.provider},
+ colors={"provider": Fore.CYAN}
+ )
+
+ # Cache handling
+ cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter"
+ cache_dir.mkdir(parents=True, exist_ok=True)
+ cache_key = self._get_cache_key(html, self.instruction or "")
+ cache_file = cache_dir / f"{cache_key}.json"
+
+ if not ignore_cache and cache_file.exists():
+ if self.logger:
+ self.logger.info("Found cached result", tag="CACHE")
+ try:
+ with cache_file.open('r') as f:
+ cached_data = json.load(f)
+ usage = TokenUsage(**cached_data['usage'])
+ self.usages.append(usage)
+ self.total_usage.completion_tokens += usage.completion_tokens
+ self.total_usage.prompt_tokens += usage.prompt_tokens
+ self.total_usage.total_tokens += usage.total_tokens
+ return cached_data['blocks']
+ except Exception as e:
+ if self.logger:
+ self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
+
+ # Split into chunks
+ html_chunks = self._merge_chunks(html)
+ if self.logger:
+ self.logger.info(
+ "Split content into {chunk_count} chunks",
+ tag="CHUNK",
+ params={"chunk_count": len(html_chunks)},
+ colors={"chunk_count": Fore.YELLOW}
+ )
+
+ extracted_content = []
+ start_time = time.time()
+
+ # Process chunks in parallel
+ with ThreadPoolExecutor(max_workers=4) as executor:
+ futures = []
+ for i, chunk in enumerate(html_chunks):
+ if self.logger:
+ self.logger.debug(
+ "Processing chunk {chunk_num}/{total_chunks}",
+ tag="CHUNK",
+ params={
+ "chunk_num": i + 1,
+ "total_chunks": len(html_chunks)
+ }
+ )
+
+ prompt_variables = {
+ "HTML": escape_json_string(sanitize_html(chunk)),
+ "REQUEST": self.instruction or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content."
+ }
+
+ prompt = PROMPT_FILTER_CONTENT
+ for var, value in prompt_variables.items():
+ prompt = prompt.replace("{" + var + "}", value)
+
+ future = executor.submit(
+ perform_completion_with_backoff,
+ self.provider,
+ prompt,
+ self.api_token,
+ base_url=self.api_base,
+ extra_args=self.extra_args
+ )
+ futures.append((i, future))
+
+ # Collect results in order
+ ordered_results = []
+ for i, future in sorted(futures):
+ try:
+ response = future.result()
+
+ # Track usage
+ usage = TokenUsage(
+ completion_tokens=response.usage.completion_tokens,
+ prompt_tokens=response.usage.prompt_tokens,
+ total_tokens=response.usage.total_tokens,
+ completion_tokens_details=response.usage.completion_tokens_details.__dict__
+ if response.usage.completion_tokens_details else {},
+ prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+ if response.usage.prompt_tokens_details else {},
+ )
+ self.usages.append(usage)
+ self.total_usage.completion_tokens += usage.completion_tokens
+ self.total_usage.prompt_tokens += usage.prompt_tokens
+ self.total_usage.total_tokens += usage.total_tokens
+
+ blocks = extract_xml_data(["content"], response.choices[0].message.content)["content"]
+ if blocks:
+ ordered_results.append(blocks)
+ if self.logger:
+ self.logger.success(
+ "Successfully processed chunk {chunk_num}",
+ tag="CHUNK",
+ params={"chunk_num": i + 1}
+ )
+ except Exception as e:
+ if self.logger:
+ self.logger.error(
+ "Error processing chunk {chunk_num}: {error}",
+ tag="CHUNK",
+ params={
+ "chunk_num": i + 1,
+ "error": str(e)
+ }
+ )
+
+ end_time = time.time()
+ if self.logger:
+ self.logger.success(
+ "Completed processing in {time:.2f}s",
+ tag="LLM",
+ params={"time": end_time - start_time},
+ colors={"time": Fore.YELLOW}
+ )
+
+ result = ordered_results if ordered_results else []
+
+ # Cache the final result
+ cache_data = {
+ 'blocks': result,
+ 'usage': self.total_usage.__dict__
+ }
+ with cache_file.open('w') as f:
+ json.dump(cache_data, f)
+ if self.logger:
+ self.logger.info("Cached results for future use", tag="CACHE")
+
+ return result
+
+ def show_usage(self) -> None:
+ """Print usage statistics"""
+ print("\n=== Token Usage Summary ===")
+ print(f"{'Type':<15} {'Count':>12}")
+ print("-" * 30)
+ print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
+ print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
+ print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
+
+ if self.usages:
+ print("\n=== Usage History ===")
+ print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
+ print("-" * 48)
+ for i, usage in enumerate(self.usages, 1):
+ print(
+ f"{i:<10} {usage.completion_tokens:>12,} "
+ f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
+ )
\ No newline at end of file
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index f3a96cf3..6cb169db 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -1,32 +1,61 @@
-import re # Point 1: Pre-Compile Regular Expressions
-import time
+import re
+from itertools import chain
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
-from concurrent.futures import ThreadPoolExecutor
-import asyncio, requests, re, os
-from .config import *
-from bs4 import element, NavigableString, Comment
+import asyncio
+import requests
+from .config import (
+ MIN_WORD_THRESHOLD,
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+ IMAGE_SCORE_THRESHOLD,
+ ONLY_TEXT_ELIGIBLE_TAGS,
+ IMPORTANT_ATTRS,
+ SOCIAL_MEDIA_DOMAINS,
+)
+from bs4 import NavigableString, Comment
from bs4 import PageElement, Tag
from urllib.parse import urljoin
from requests.exceptions import InvalidSchema
-# from .content_cleaning_strategy import ContentCleaningStrategy
-from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
-from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
-from .models import MarkdownGenerationResult
from .utils import (
extract_metadata,
normalize_url,
- is_external_url,
- get_base_domain,
+ is_external_url,
+ get_base_domain,
+ extract_metadata_using_lxml,
)
-
+from lxml import etree
+from lxml import html as lhtml
+from typing import List
+from .models import ScrapingResult, MediaItem, Link, Media, Links
# Pre-compile regular expressions for Open Graph and Twitter metadata
-OG_REGEX = re.compile(r'^og:')
-TWITTER_REGEX = re.compile(r'^twitter:')
+OG_REGEX = re.compile(r"^og:")
+TWITTER_REGEX = re.compile(r"^twitter:")
DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
+
+# Function to parse srcset
+def parse_srcset(s: str) -> List[Dict]:
+ if not s:
+ return []
+ variants = []
+ for part in s.split(","):
+ part = part.strip()
+ if not part:
+ continue
+ parts = part.split()
+ if len(parts) >= 1:
+ url = parts[0]
+ width = (
+ parts[1].rstrip("w")
+ if len(parts) > 1 and parts[1].endswith("w")
+ else None
+ )
+ variants.append({"url": url, "width": width})
+ return variants
+
+
# Function to parse image height/width value and units
def parse_dimension(dimension):
if dimension:
@@ -34,39 +63,42 @@ def parse_dimension(dimension):
match = DIMENSION_REGEX.match(dimension)
if match:
number = int(match.group(1))
- unit = match.group(2) or 'px' # Default unit is 'px' if not specified
+ unit = match.group(2) or "px" # Default unit is 'px' if not specified
return number, unit
return None, None
+
# Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
- #If src is relative path construct full URL, if not it may be CDN URL
- img_url = urljoin(base_url,img.get('src'))
+ # If src is relative path construct full URL, if not it may be CDN URL
+ img_url = urljoin(base_url, img.get("src"))
try:
response = requests.head(img_url)
if response.status_code == 200:
- return response.headers.get('Content-Length',None)
+ return response.headers.get("Content-Length", None)
else:
print(f"Failed to retrieve file size for {img_url}")
return None
- except InvalidSchema as e:
+ except InvalidSchema:
return None
finally:
return
+
class ContentScrapingStrategy(ABC):
@abstractmethod
- def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+ def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
pass
@abstractmethod
- async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+ async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
pass
+
class WebScrapingStrategy(ContentScrapingStrategy):
"""
- Class for web content scraping. Perhaps the most important class.
-
+ Class for web content scraping. Perhaps the most important class.
+
How it works:
1. Extract content from HTML using BeautifulSoup.
2. Clean the extracted content using a content cleaning strategy.
@@ -74,7 +106,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
4. Generate markdown content from the filtered content.
5. Return the markdown content.
"""
-
+
def __init__(self, logger=None):
self.logger = logger
@@ -83,10 +115,10 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if self.logger:
log_method = getattr(self.logger, level)
log_method(message=message, tag=tag, **kwargs)
-
- def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+
+ def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
"""
- Main entry point for content scraping.
+ Main entry point for content scraping.
Args:
url (str): The URL of the page to scrape.
@@ -94,16 +126,60 @@ class WebScrapingStrategy(ContentScrapingStrategy):
**kwargs: Additional keyword arguments.
Returns:
- Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-
- - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+ ScrapingResult: A structured result containing the scraped content.
"""
- return self._scrap(url, html, is_async=False, **kwargs)
+ raw_result = self._scrap(url, html, is_async=False, **kwargs)
+ if raw_result is None:
+ return ScrapingResult(
+ cleaned_html="",
+ success=False,
+ media=Media(),
+ links=Links(),
+ metadata={},
+ )
- async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+ # Convert media items
+ media = Media(
+ images=[
+ MediaItem(**img)
+ for img in raw_result.get("media", {}).get("images", [])
+ if img
+ ],
+ videos=[
+ MediaItem(**vid)
+ for vid in raw_result.get("media", {}).get("videos", [])
+ if vid
+ ],
+ audios=[
+ MediaItem(**aud)
+ for aud in raw_result.get("media", {}).get("audios", [])
+ if aud
+ ],
+ )
+
+ # Convert links
+ links = Links(
+ internal=[
+ Link(**link)
+ for link in raw_result.get("links", {}).get("internal", [])
+ if link
+ ],
+ external=[
+ Link(**link)
+ for link in raw_result.get("links", {}).get("external", [])
+ if link
+ ],
+ )
+
+ return ScrapingResult(
+ cleaned_html=raw_result.get("cleaned_html", ""),
+ success=raw_result.get("success", False),
+ media=media,
+ links=links,
+ metadata=raw_result.get("metadata", {}),
+ )
+
+ async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
"""
Main entry point for asynchronous content scraping.
@@ -113,12 +189,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
**kwargs: Additional keyword arguments.
Returns:
- Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-
- - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+ ScrapingResult: A structured result containing the scraped content.
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
@@ -134,7 +205,11 @@ class WebScrapingStrategy(ContentScrapingStrategy):
"""
if isinstance(node, NavigableString):
return node
- if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
+ if (
+ len(node.contents) == 1
+ and isinstance(node.contents[0], Tag)
+ and node.contents[0].name == node.name
+ ):
return self.flatten_nested_elements(node.contents[0])
node.contents = [self.flatten_nested_elements(child) for child in node.contents]
return node
@@ -150,23 +225,27 @@ class WebScrapingStrategy(ContentScrapingStrategy):
Returns:
Tag: The closest parent with useful text, or None if not found.
"""
- image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
+ image_description_min_word_threshold = kwargs.get(
+ "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+ )
current_tag = tag
while current_tag:
current_tag = current_tag.parent
# Get the text content of the parent tag
if current_tag:
- text_content = current_tag.get_text(separator=' ',strip=True)
+ text_content = current_tag.get_text(separator=" ", strip=True)
# Check if the text content has at least word_count_threshold
if len(text_content.split()) >= image_description_min_word_threshold:
return text_content
return None
- def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
+ def remove_unwanted_attributes(
+ self, element, important_attrs, keep_data_attributes=False
+ ):
"""
Remove unwanted attributes from an HTML element.
- Args:
+ Args:
element (Tag): The HTML element to remove attributes from.
important_attrs (list): List of important attributes to keep.
keep_data_attributes (bool): Whether to keep data attributes.
@@ -178,18 +257,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for attr in element.attrs:
if attr not in important_attrs:
if keep_data_attributes:
- if not attr.startswith('data-'):
+ if not attr.startswith("data-"):
attrs_to_remove.append(attr)
else:
attrs_to_remove.append(attr)
-
+
for attr in attrs_to_remove:
del element[attr]
def process_image(self, img, url, index, total_images, **kwargs):
"""
Process an image element.
-
+
How it works:
1. Check if the image has valid display and inside undesired html elements.
2. Score an image for it's usefulness.
@@ -207,33 +286,35 @@ class WebScrapingStrategy(ContentScrapingStrategy):
Returns:
dict: A dictionary containing the processed image information.
"""
- parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
- if ' ' in u else None}
- for u in [f"http{p}" for p in s.split("http") if p]]
-
+ # parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
+ # if ' ' in u else None}
+ # for u in [f"http{p}" for p in s.split("http") if p]]
+
# Constants for checks
- classes_to_check = frozenset(['button', 'icon', 'logo'])
- tags_to_check = frozenset(['button', 'input'])
- image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif'])
-
+ classes_to_check = frozenset(["button", "icon", "logo"])
+ tags_to_check = frozenset(["button", "input"])
+ image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
+
# Pre-fetch commonly used attributes
- style = img.get('style', '')
- alt = img.get('alt', '')
- src = img.get('src', '')
- data_src = img.get('data-src', '')
- srcset = img.get('srcset', '')
- data_srcset = img.get('data-srcset', '')
- width = img.get('width')
- height = img.get('height')
+ style = img.get("style", "")
+ alt = img.get("alt", "")
+ src = img.get("src", "")
+ data_src = img.get("data-src", "")
+ srcset = img.get("srcset", "")
+ data_srcset = img.get("data-srcset", "")
+ width = img.get("width")
+ height = img.get("height")
parent = img.parent
- parent_classes = parent.get('class', [])
+ parent_classes = parent.get("class", [])
# Quick validation checks
- if ('display:none' in style or
- parent.name in tags_to_check or
- any(c in cls for c in parent_classes for cls in classes_to_check) or
- any(c in src for c in classes_to_check) or
- any(c in alt for c in classes_to_check)):
+ if (
+ "display:none" in style
+ or parent.name in tags_to_check
+ or any(c in cls for c in parent_classes for cls in classes_to_check)
+ or any(c in src for c in classes_to_check)
+ or any(c in alt for c in classes_to_check)
+ ):
return None
# Quick score calculation
@@ -246,30 +327,29 @@ class WebScrapingStrategy(ContentScrapingStrategy):
score += 1 if height_val > 150 else 0
if alt:
score += 1
- score += index/total_images < 0.5
-
+ score += index / total_images < 0.5
+
# image_format = ''
# if "data:image/" in src:
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
# else:
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
-
+
# if image_format in ('jpg', 'png', 'webp', 'avif'):
# score += 1
-
-
+
# Check for image format in all possible sources
def has_image_format(url):
return any(fmt in url.lower() for fmt in image_formats)
-
+
# Score for having proper image sources
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
score += 1
if srcset or data_srcset:
score += 1
- if img.find_parent('picture'):
+ if img.find_parent("picture"):
score += 1
-
+
# Detect format from any available source
detected_format = None
for url in [src, data_src, srcset, data_srcset]:
@@ -277,63 +357,66 @@ class WebScrapingStrategy(ContentScrapingStrategy):
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
if format_matches:
detected_format = format_matches[0]
- break
+ break
- if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
+ if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
return None
# Use set for deduplication
unique_urls = set()
image_variants = []
-
+
# Generate a unique group ID for this set of variants
- group_id = index
-
+ group_id = index
+
# Base image info template
- image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
base_info = {
- 'alt': alt,
- 'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
- 'score': score,
- 'type': 'image',
- 'group_id': group_id, # Group ID for this set of variants
- 'format': detected_format,
+ "alt": alt,
+ "desc": self.find_closest_parent_with_useful_text(img, **kwargs),
+ "score": score,
+ "type": "image",
+ "group_id": group_id, # Group ID for this set of variants
+ "format": detected_format,
}
# Inline function for adding variants
def add_variant(src, width=None):
- if src and not src.startswith('data:') and src not in unique_urls:
+ if src and not src.startswith("data:") and src not in unique_urls:
unique_urls.add(src)
- image_variants.append({**base_info, 'src': src, 'width': width})
+ image_variants.append({**base_info, "src": src, "width": width})
# Process all sources
add_variant(src)
add_variant(data_src)
-
+
# Handle srcset and data-srcset in one pass
- for attr in ('srcset', 'data-srcset'):
+ for attr in ("srcset", "data-srcset"):
if value := img.get(attr):
for source in parse_srcset(value):
- add_variant(source['url'], source['width'])
+ add_variant(source["url"], source["width"])
# Quick picture element check
- if picture := img.find_parent('picture'):
- for source in picture.find_all('source'):
- if srcset := source.get('srcset'):
+ if picture := img.find_parent("picture"):
+ for source in picture.find_all("source"):
+ if srcset := source.get("srcset"):
for src in parse_srcset(srcset):
- add_variant(src['url'], src['width'])
+ add_variant(src["url"], src["width"])
# Framework-specific attributes in one pass
for attr, value in img.attrs.items():
- if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
+ if (
+ attr.startswith("data-")
+ and ("src" in attr or "srcset" in attr)
+ and "http" in value
+ ):
add_variant(value)
return image_variants if image_variants else None
- def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
+ def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
"""
Process an HTML element.
-
+
How it works:
1. Check if the element is an image, video, or audio.
2. Extract the element's attributes and content.
@@ -348,89 +431,92 @@ class WebScrapingStrategy(ContentScrapingStrategy):
Returns:
dict: A dictionary containing the processed element information.
"""
- media = {'images': [], 'videos': [], 'audios': []}
+ media = {"images": [], "videos": [], "audios": []}
internal_links_dict = {}
external_links_dict = {}
self._process_element(
- url,
- element,
- media,
- internal_links_dict,
- external_links_dict,
- **kwargs
+ url, element, media, internal_links_dict, external_links_dict, **kwargs
)
return {
- 'media': media,
- 'internal_links_dict': internal_links_dict,
- 'external_links_dict': external_links_dict
+ "media": media,
+ "internal_links_dict": internal_links_dict,
+ "external_links_dict": external_links_dict,
}
-
- def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
+
+ def _process_element(
+ self,
+ url,
+ element: PageElement,
+ media: Dict[str, Any],
+ internal_links_dict: Dict[str, Any],
+ external_links_dict: Dict[str, Any],
+ **kwargs,
+ ) -> bool:
"""
- Process an HTML element.
+ Process an HTML element.
"""
try:
if isinstance(element, NavigableString):
if isinstance(element, Comment):
element.extract()
return False
-
+
# if element.name == 'img':
# process_image(element, url, 0, 1)
# return True
base_domain = kwargs.get("base_domain", get_base_domain(url))
- if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
+ if element.name in ["script", "style", "link", "meta", "noscript"]:
element.decompose()
return False
keep_element = False
-
- exclude_domains = kwargs.get('exclude_domains', [])
+
+ exclude_domains = kwargs.get("exclude_domains", [])
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
# exclude_social_media_domains = list(set(exclude_social_media_domains))
-
+
try:
- if element.name == 'a' and element.get('href'):
- href = element.get('href', '').strip()
+ if element.name == "a" and element.get("href"):
+ href = element.get("href", "").strip()
if not href: # Skip empty hrefs
return False
-
- url_base = url.split('/')[2]
-
+
+ # url_base = url.split("/")[2]
+
# Normalize the URL
try:
normalized_href = normalize_url(href, url)
- except ValueError as e:
+ except ValueError:
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
return False
-
+
link_data = {
- 'href': normalized_href,
- 'text': element.get_text().strip(),
- 'title': element.get('title', '').strip(),
- 'base_domain': base_domain
+ "href": normalized_href,
+ "text": element.get_text().strip(),
+ "title": element.get("title", "").strip(),
+ "base_domain": base_domain,
}
-
+
is_external = is_external_url(normalized_href, base_domain)
-
+
keep_element = True
-
+
# Handle external link exclusions
if is_external:
link_base_domain = get_base_domain(normalized_href)
- link_data['base_domain'] = link_base_domain
- if kwargs.get('exclude_external_links', False):
+ link_data["base_domain"] = link_base_domain
+ if kwargs.get("exclude_external_links", False):
element.decompose()
return False
# elif kwargs.get('exclude_social_media_links', False):
# if link_base_domain in exclude_social_media_domains:
# element.decompose()
# return False
- # if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
- # element.decompose()
- # return False
+ # if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
+ # element.decompose()
+ # return False
elif exclude_domains:
if link_base_domain in exclude_domains:
element.decompose()
@@ -446,32 +532,36 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if normalized_href not in internal_links_dict:
internal_links_dict[normalized_href] = link_data
-
except Exception as e:
raise Exception(f"Error processing links: {str(e)}")
try:
- if element.name == 'img':
- potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original']
- src = element.get('src', '')
+ if element.name == "img":
+ potential_sources = [
+ "src",
+ "data-src",
+                    "srcset", "data-lazy-src",
+ "data-original",
+ ]
+ src = element.get("src", "")
while not src and potential_sources:
- src = element.get(potential_sources.pop(0), '')
+ src = element.get(potential_sources.pop(0), "")
if not src:
element.decompose()
return False
-
+
# If it is srcset pick up the first image
- if 'srcset' in element.attrs:
- src = element.attrs['srcset'].split(',')[0].split(' ')[0]
-
+ if "srcset" in element.attrs:
+ src = element.attrs["srcset"].split(",")[0].split(" ")[0]
+
# If image src is internal, then skip
if not is_external_url(src, base_domain):
return True
-
+
image_src_base_domain = get_base_domain(src)
-
+
# Check flag if we should remove external images
- if kwargs.get('exclude_external_images', False):
+ if kwargs.get("exclude_external_images", False):
element.decompose()
return False
# src_url_base = src.split('/')[2]
@@ -479,78 +569,98 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# if url_base not in src_url_base:
# element.decompose()
# return False
-
+
# if kwargs.get('exclude_social_media_links', False):
# if image_src_base_domain in exclude_social_media_domains:
# element.decompose()
# return False
- # src_url_base = src.split('/')[2]
- # url_base = url.split('/')[2]
- # if any(domain in src for domain in exclude_social_media_domains):
- # element.decompose()
- # return False
-
+ # src_url_base = src.split('/')[2]
+ # url_base = url.split('/')[2]
+ # if any(domain in src for domain in exclude_social_media_domains):
+ # element.decompose()
+ # return False
+
# Handle exclude domains
- if exclude_domains:
+ if exclude_domains:
if image_src_base_domain in exclude_domains:
element.decompose()
return False
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
# element.decompose()
# return False
-
+
return True # Always keep image elements
- except Exception as e:
+ except Exception:
raise "Error processing images"
-
-
+
# Check if flag to remove all forms is set
- if kwargs.get('remove_forms', False) and element.name == 'form':
+ if kwargs.get("remove_forms", False) and element.name == "form":
element.decompose()
return False
-
- if element.name in ['video', 'audio']:
- media[f"{element.name}s"].append({
- 'src': element.get('src'),
- 'alt': element.get('alt'),
- 'type': element.name,
- 'description': self.find_closest_parent_with_useful_text(element, **kwargs)
- })
- source_tags = element.find_all('source')
+
+ if element.name in ["video", "audio"]:
+ media[f"{element.name}s"].append(
+ {
+ "src": element.get("src"),
+ "alt": element.get("alt"),
+ "type": element.name,
+ "description": self.find_closest_parent_with_useful_text(
+ element, **kwargs
+ ),
+ }
+ )
+ source_tags = element.find_all("source")
for source_tag in source_tags:
- media[f"{element.name}s"].append({
- 'src': source_tag.get('src'),
- 'alt': element.get('alt'),
- 'type': element.name,
- 'description': self.find_closest_parent_with_useful_text(element, **kwargs)
- })
+ media[f"{element.name}s"].append(
+ {
+ "src": source_tag.get("src"),
+ "alt": element.get("alt"),
+ "type": element.name,
+ "description": self.find_closest_parent_with_useful_text(
+ element, **kwargs
+ ),
+ }
+ )
return True # Always keep video and audio elements
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
- if kwargs.get('only_text', False):
+ if kwargs.get("only_text", False):
element.replace_with(element.get_text())
try:
- self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
+ self.remove_unwanted_attributes(
+ element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
+ )
except Exception as e:
# print('Error removing unwanted attributes:', str(e))
- self._log('error',
+ self._log(
+ "error",
message="Error removing unwanted attributes: {error}",
tag="SCRAPE",
- params={"error": str(e)}
+ params={"error": str(e)},
)
# Process children
for child in list(element.children):
- if isinstance(child, NavigableString) and not isinstance(child, Comment):
+ if isinstance(child, NavigableString) and not isinstance(
+ child, Comment
+ ):
if len(child.strip()) > 0:
keep_element = True
else:
- if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs):
+ if self._process_element(
+ url,
+ child,
+ media,
+ internal_links_dict,
+ external_links_dict,
+ **kwargs,
+ ):
keep_element = True
-
# Check word count
- word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD)
+ word_count_threshold = kwargs.get(
+ "word_count_threshold", MIN_WORD_THRESHOLD
+ )
if not keep_element:
word_count = len(element.get_text(strip=True).split())
keep_element = word_count >= word_count_threshold
@@ -561,14 +671,22 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return keep_element
except Exception as e:
# print('Error processing element:', str(e))
- self._log('error',
+ self._log(
+ "error",
message="Error processing element: {error}",
tag="SCRAPE",
- params={"error": str(e)}
- )
+ params={"error": str(e)},
+ )
return False
- def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+ def _scrap(
+ self,
+ url: str,
+ html: str,
+ word_count_threshold: int = MIN_WORD_THRESHOLD,
+ css_selector: str = None,
+ **kwargs,
+ ) -> Dict[str, Any]:
"""
Extract content from HTML using BeautifulSoup.
@@ -586,83 +704,93 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if not html:
return None
- parser_type = kwargs.get('parser', 'lxml')
+ parser_type = kwargs.get("parser", "lxml")
soup = BeautifulSoup(html, parser_type)
body = soup.body
base_domain = get_base_domain(url)
-
+
try:
meta = extract_metadata("", soup)
except Exception as e:
- self._log('error',
+ self._log(
+ "error",
message="Error extracting metadata: {error}",
tag="SCRAPE",
- params={"error": str(e)}
- )
+ params={"error": str(e)},
+ )
meta = {}
-
+
# Handle tag-based removal first - faster than CSS selection
- excluded_tags = set(kwargs.get('excluded_tags', []) or [])
+ excluded_tags = set(kwargs.get("excluded_tags", []) or [])
if excluded_tags:
for element in body.find_all(lambda tag: tag.name in excluded_tags):
element.extract()
-
+
# Handle CSS selector-based removal
- excluded_selector = kwargs.get('excluded_selector', '')
+ excluded_selector = kwargs.get("excluded_selector", "")
if excluded_selector:
- is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector
+ is_single_selector = (
+ "," not in excluded_selector and " " not in excluded_selector
+ )
if is_single_selector:
while element := body.select_one(excluded_selector):
element.extract()
else:
for element in body.select(excluded_selector):
- element.extract()
-
+ element.extract()
+
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
return {
- 'markdown': '',
- 'cleaned_html': '',
- 'success': True,
- 'media': {'images': [], 'videos': [], 'audios': []},
- 'links': {'internal': [], 'external': []},
- 'metadata': {},
- 'message': f"No elements found for CSS selector: {css_selector}"
+ "markdown": "",
+ "cleaned_html": "",
+ "success": True,
+ "media": {"images": [], "videos": [], "audios": []},
+ "links": {"internal": [], "external": []},
+ "metadata": {},
+ "message": f"No elements found for CSS selector: {css_selector}",
}
# raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
- body = soup.new_tag('div')
+ body = soup.new_tag("div")
for el in selected_elements:
body.append(el)
- kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS)
- kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', []))
- if kwargs.get('exclude_social_media_links', False):
- kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains'])
-
- result_obj = self.process_element(
- url,
- body,
- word_count_threshold = word_count_threshold,
- base_domain=base_domain,
- **kwargs
+ kwargs["exclude_social_media_domains"] = set(
+ kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
)
-
- links = {'internal': [], 'external': []}
- media = result_obj['media']
- internal_links_dict = result_obj['internal_links_dict']
- external_links_dict = result_obj['external_links_dict']
-
+ kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
+ if kwargs.get("exclude_social_media_links", False):
+ kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
+ kwargs["exclude_social_media_domains"]
+ )
+
+ result_obj = self.process_element(
+ url,
+ body,
+ word_count_threshold=word_count_threshold,
+ base_domain=base_domain,
+ **kwargs,
+ )
+
+ links = {"internal": [], "external": []}
+ media = result_obj["media"]
+ internal_links_dict = result_obj["internal_links_dict"]
+ external_links_dict = result_obj["external_links_dict"]
+
# Update the links dictionary with unique links
- links['internal'] = list(internal_links_dict.values())
- links['external'] = list(external_links_dict.values())
+ links["internal"] = list(internal_links_dict.values())
+ links["external"] = list(external_links_dict.values())
# # Process images using ThreadPoolExecutor
- imgs = body.find_all('img')
-
- media['images'] = [
- img for result in (self.process_image(img, url, i, len(imgs))
- for i, img in enumerate(imgs))
+ imgs = body.find_all("img")
+
+ media["images"] = [
+ img
+ for result in (
+ self.process_image(img, url, i, len(imgs), **kwargs)
+ for i, img in enumerate(imgs)
+ )
if result is not None
for img in result
]
@@ -670,22 +798,22 @@ class WebScrapingStrategy(ContentScrapingStrategy):
body = self.flatten_nested_elements(body)
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
for img in imgs:
- src = img.get('src', '')
+ src = img.get("src", "")
if base64_pattern.match(src):
# Replace base64 data with empty string
- img['src'] = base64_pattern.sub('', src)
-
+ img["src"] = base64_pattern.sub("", src)
+
str_body = ""
try:
- str_body = body.encode_contents().decode('utf-8')
- except Exception as e:
+ str_body = body.encode_contents().decode("utf-8")
+ except Exception:
# Reset body to the original HTML
success = False
- body = BeautifulSoup(html, 'html.parser')
-
+ body = BeautifulSoup(html, "html.parser")
+
# Create a new div with a special ID
- error_div = body.new_tag('div', id='crawl4ai_error_message')
- error_div.string = '''
+ error_div = body.new_tag("div", id="crawl4ai_error_message")
+ error_div.string = """
Crawl4AI Error: This page is not fully supported.
Possible reasons:
@@ -698,26 +826,547 @@ class WebScrapingStrategy(ContentScrapingStrategy):
- Set headless=False to visualize what's happening on the page.
If the issue persists, please check the page's structure and any potential anti-crawling measures.
- '''
-
+ """
+
# Append the error div to the body
- body.body.append(error_div)
- str_body = body.encode_contents().decode('utf-8')
-
- print(f"[LOG] π§ Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
- self._log('error',
+ body.append(error_div)
+ str_body = body.encode_contents().decode("utf-8")
+
+ print(
+ "[LOG] π§ Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
+ )
+ self._log(
+ "error",
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
- tag="SCRAPE"
+ tag="SCRAPE",
)
- cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
+ cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
-
return {
# **markdown_content,
- 'cleaned_html': cleaned_html,
- 'success': success,
- 'media': media,
- 'links': links,
- 'metadata': meta
+ "cleaned_html": cleaned_html,
+ "success": success,
+ "media": media,
+ "links": links,
+ "metadata": meta,
}
+
+
+class LXMLWebScrapingStrategy(WebScrapingStrategy):
+ def __init__(self, logger=None):
+ super().__init__(logger)
+ self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
+ self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+
+ def _process_element(
+ self,
+ url: str,
+ element: lhtml.HtmlElement,
+ media: Dict[str, List],
+ internal_links_dict: Dict[str, Any],
+ external_links_dict: Dict[str, Any],
+ **kwargs,
+ ) -> bool:
+ base_domain = kwargs.get("base_domain", get_base_domain(url))
+ exclude_domains = set(kwargs.get("exclude_domains", []))
+
+ # Process links
+ for link in element.xpath(".//a[@href]"):
+ href = link.get("href", "").strip()
+ if not href:
+ continue
+
+ try:
+ normalized_href = normalize_url(href, url)
+ link_data = {
+ "href": normalized_href,
+ "text": link.text_content().strip(),
+ "title": link.get("title", "").strip(),
+ "base_domain": base_domain,
+ }
+
+ is_external = is_external_url(normalized_href, base_domain)
+ if is_external:
+ link_base_domain = get_base_domain(normalized_href)
+ link_data["base_domain"] = link_base_domain
+ if (
+ kwargs.get("exclude_external_links", False)
+ or link_base_domain in exclude_domains
+ ):
+ link.getparent().remove(link)
+ continue
+
+ if normalized_href not in external_links_dict:
+ external_links_dict[normalized_href] = link_data
+ else:
+ if normalized_href not in internal_links_dict:
+ internal_links_dict[normalized_href] = link_data
+
+ except Exception as e:
+ self._log("error", f"Error processing link: {str(e)}", "SCRAPE")
+ continue
+
+ # Process images
+ images = element.xpath(".//img")
+ total_images = len(images)
+
+ for idx, img in enumerate(images):
+ src = img.get("src") or ""
+ img_domain = get_base_domain(src)
+
+ # Decide if we need to exclude this image
+ # 1) If its domain is in exclude_domains, remove.
+ # 2) Or if exclude_external_images=True and it's an external domain, remove.
+ if (img_domain in exclude_domains) or (
+ kwargs.get("exclude_external_images", False)
+ and is_external_url(src, base_domain)
+ ):
+ parent = img.getparent()
+ if parent is not None:
+ parent.remove(img)
+ continue
+
+ # Otherwise, process the image as usual.
+ try:
+ processed_images = self.process_image(
+ img, url, idx, total_images, **kwargs
+ )
+ if processed_images:
+ media["images"].extend(processed_images)
+ except Exception as e:
+ self._log("error", f"Error processing image: {str(e)}", "SCRAPE")
+
+ # Process videos and audios
+ for media_type in ["video", "audio"]:
+ for elem in element.xpath(f".//{media_type}"):
+ media_info = {
+ "src": elem.get("src"),
+ "alt": elem.get("alt"),
+ "type": media_type,
+ "description": self.find_closest_parent_with_useful_text(
+ elem, **kwargs
+ ),
+ }
+ media[f"{media_type}s"].append(media_info)
+
+ # Process source tags within media elements
+ for source in elem.xpath(".//source"):
+ if src := source.get("src"):
+ media[f"{media_type}s"].append({**media_info, "src": src})
+
+ # Clean up unwanted elements
+ if kwargs.get("remove_forms", False):
+ for form in element.xpath(".//form"):
+ form.getparent().remove(form)
+
+ if excluded_tags := kwargs.get("excluded_tags", []):
+ for tag in excluded_tags:
+ for elem in element.xpath(f".//{tag}"):
+ elem.getparent().remove(elem)
+
+ if excluded_selector := kwargs.get("excluded_selector", ""):
+ try:
+ for elem in element.cssselect(excluded_selector):
+ elem.getparent().remove(elem)
+ except Exception:
+ pass # Invalid selector
+
+ return True
+
+ def find_closest_parent_with_useful_text(
+ self, element: lhtml.HtmlElement, **kwargs
+ ) -> Optional[str]:
+ image_description_min_word_threshold = kwargs.get(
+ "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+ )
+ current = element
+ while current is not None:
+ if (
+ current.text
+ and len(current.text_content().split())
+ >= image_description_min_word_threshold
+ ):
+ return current.text_content().strip()
+ current = current.getparent()
+ return None
+
+ def flatten_nested_elements(self, element: lhtml.HtmlElement) -> lhtml.HtmlElement:
+ """Flatten nested elements of the same type in LXML tree"""
+ if len(element) == 1 and element.tag == element[0].tag:
+ return self.flatten_nested_elements(element[0])
+
+ for child in element:
+ child_idx = element.index(child)
+ flattened_child = self.flatten_nested_elements(child)
+ if flattened_child is not child: # Only replace if actually flattened
+ element[child_idx] = flattened_child
+
+ return element
+
+ def process_image(
+ self, img: lhtml.HtmlElement, url: str, index: int, total_images: int, **kwargs
+ ) -> Optional[List[Dict]]:
+ # Quick validation checks
+ style = img.get("style", "")
+ alt = img.get("alt", "")
+ src = img.get("src", "")
+ data_src = img.get("data-src", "")
+ srcset = img.get("srcset", "")
+ data_srcset = img.get("data-srcset", "")
+
+ if "display:none" in style:
+ return None
+
+ parent = img.getparent()
+        if parent is None or parent.tag in ["button", "input"]:
+ return None
+
+ parent_classes = parent.get("class", "").split()
+ if any(
+ "button" in cls or "icon" in cls or "logo" in cls for cls in parent_classes
+ ):
+ return None
+
+ # If src is in class or alt, likely an icon
+ if (src and any(c in src for c in ["button", "icon", "logo"])) or (
+ alt and any(c in alt for c in ["button", "icon", "logo"])
+ ):
+ return None
+
+ # Score calculation
+ score = 0
+ if (width := img.get("width")) and width.isdigit():
+ score += 1 if int(width) > 150 else 0
+ if (height := img.get("height")) and height.isdigit():
+ score += 1 if int(height) > 150 else 0
+ if alt:
+ score += 1
+ score += index / total_images < 0.5
+
+ # Check formats in all possible sources
+ image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif"}
+ detected_format = None
+        for candidate in [src, data_src, srcset, data_srcset]:
+            if candidate:
+                format_matches = [fmt for fmt in image_formats if fmt in candidate.lower()]
+ if format_matches:
+ detected_format = format_matches[0]
+ score += 1
+ break
+
+ if srcset or data_srcset:
+ score += 1
+
+ if picture := img.xpath("./ancestor::picture[1]"):
+ score += 1
+
+ if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
+ return None
+
+ # Process image variants
+ unique_urls = set()
+ image_variants = []
+ base_info = {
+ "alt": alt,
+ "desc": self.find_closest_parent_with_useful_text(img, **kwargs),
+ "score": score,
+ "type": "image",
+ "group_id": index,
+ "format": detected_format,
+ }
+
+ def add_variant(src: str, width: Optional[str] = None):
+ if src and not src.startswith("data:") and src not in unique_urls:
+ unique_urls.add(src)
+ variant = {**base_info, "src": src}
+ if width:
+ variant["width"] = width
+ image_variants.append(variant)
+
+ # Add variants from different sources
+ add_variant(src)
+ add_variant(data_src)
+
+ for srcset_attr in [srcset, data_srcset]:
+ if srcset_attr:
+ for source in parse_srcset(srcset_attr):
+ add_variant(source["url"], source["width"])
+
+ # Handle picture element
+ if picture:
+ for source in picture[0].xpath(".//source[@srcset]"):
+ if source_srcset := source.get("srcset"):
+ for src_data in parse_srcset(source_srcset):
+ add_variant(src_data["url"], src_data["width"])
+
+ # Check framework-specific attributes
+ for attr, value in img.attrib.items():
+ if (
+ attr.startswith("data-")
+ and ("src" in attr or "srcset" in attr)
+ and "http" in value
+ ):
+ add_variant(value)
+
+ return image_variants if image_variants else None
+
+ def remove_empty_elements_fast(self, root, word_count_threshold=5):
+ """
+ Remove elements that fall below the desired word threshold in a single pass from the bottom up.
+ Skips non-element nodes like HtmlComment and bypasses certain tags that are allowed to have no content.
+ """
+ bypass_tags = {
+ "a",
+ "img",
+ "br",
+ "hr",
+ "input",
+ "meta",
+ "link",
+ "source",
+ "track",
+ "wbr",
+ }
+
+ for el in reversed(list(root.iterdescendants())):
+ if not isinstance(el, lhtml.HtmlElement):
+ continue
+
+ if el.tag in bypass_tags:
+ continue
+
+ text_content = (el.text_content() or "").strip()
+ if (
+ len(text_content.split()) < word_count_threshold
+ and not el.getchildren()
+ ):
+ parent = el.getparent()
+ if parent is not None:
+ parent.remove(el)
+
+ return root
+
+ def remove_unwanted_attributes_fast(
+ self, root: lhtml.HtmlElement, important_attrs=None, keep_data_attributes=False
+ ) -> lhtml.HtmlElement:
+ """
+ Removes all attributes from each element (including root) except those in `important_attrs`.
+ If `keep_data_attributes=True`, also retain any attribute starting with 'data-'.
+
+ Returns the same root element, mutated in-place, for fluent usage.
+ """
+ if important_attrs is None:
+ important_attrs = set(IMPORTANT_ATTRS)
+
+ # If you want to handle the root as well, use 'include_self=True'
+ # so you don't miss attributes on the top-level element.
+ # Manually include the root, then all its descendants
+ for el in chain((root,), root.iterdescendants()):
+ # We only remove attributes on HtmlElement nodes, skip comments or text nodes
+ if not isinstance(el, lhtml.HtmlElement):
+ continue
+
+ old_attribs = dict(el.attrib)
+ new_attribs = {}
+
+ for attr_name, attr_val in old_attribs.items():
+ # If it's an important attribute, keep it
+ if attr_name in important_attrs:
+ new_attribs[attr_name] = attr_val
+ # Or if keep_data_attributes is True and it's a 'data-*' attribute
+ elif keep_data_attributes and attr_name.startswith("data-"):
+ new_attribs[attr_name] = attr_val
+
+ # Clear old attributes and set the filtered set
+ el.attrib.clear()
+ el.attrib.update(new_attribs)
+
+ return root
+
+ def _scrap(
+ self,
+ url: str,
+ html: str,
+ word_count_threshold: int = MIN_WORD_THRESHOLD,
+ css_selector: str = None,
+ **kwargs,
+ ) -> Dict[str, Any]:
+ if not html:
+ return None
+
+ success = True
+ try:
+ doc = lhtml.document_fromstring(html)
+ # Match BeautifulSoup's behavior of using body or full doc
+ # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
+ body = doc
+
+ base_domain = get_base_domain(url)
+
+ # Add comment removal
+ if kwargs.get("remove_comments", False):
+ comments = body.xpath("//comment()")
+ for comment in comments:
+ comment.getparent().remove(comment)
+
+ # Handle tag-based removal first
+ excluded_tags = set(kwargs.get("excluded_tags", []) or [])
+ if excluded_tags:
+ for tag in excluded_tags:
+ for element in body.xpath(f".//{tag}"):
+ if element.getparent() is not None:
+ element.getparent().remove(element)
+
+ # Handle CSS selector-based exclusion
+ excluded_selector = kwargs.get("excluded_selector", "")
+ if excluded_selector:
+ try:
+ for element in body.cssselect(excluded_selector):
+ if element.getparent() is not None:
+ element.getparent().remove(element)
+ except Exception as e:
+ self._log(
+ "error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
+ )
+
+ # Extract metadata before any content filtering
+ try:
+ meta = extract_metadata_using_lxml(
+ "", doc
+ ) # Using same function as BeautifulSoup version
+ except Exception as e:
+ self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
+ meta = {}
+
+ # Handle CSS selector targeting
+ if css_selector:
+ try:
+ selected_elements = body.cssselect(css_selector)
+ if not selected_elements:
+ return {
+ "markdown": "",
+ "cleaned_html": "",
+ "success": True,
+ "media": {"images": [], "videos": [], "audios": []},
+ "links": {"internal": [], "external": []},
+ "metadata": meta,
+ "message": f"No elements found for CSS selector: {css_selector}",
+ }
+ body = lhtml.Element("div")
+ body.extend(selected_elements)
+ except Exception as e:
+ self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
+ return None
+
+ # Remove script and style tags
+ for tag in ["script", "style", "link", "meta", "noscript"]:
+ for element in body.xpath(f".//{tag}"):
+ if element.getparent() is not None:
+ element.getparent().remove(element)
+
+ # Handle social media and domain exclusions
+ kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
+ if kwargs.get("exclude_social_media_links", False):
+ kwargs["exclude_social_media_domains"] = set(
+ kwargs.get("exclude_social_media_domains", [])
+ + SOCIAL_MEDIA_DOMAINS
+ )
+ kwargs["exclude_domains"].update(kwargs["exclude_social_media_domains"])
+
+ # Process forms if needed
+ if kwargs.get("remove_forms", False):
+ for form in body.xpath(".//form"):
+ if form.getparent() is not None:
+ form.getparent().remove(form)
+
+ # Process content
+ media = {"images": [], "videos": [], "audios": []}
+ internal_links_dict = {}
+ external_links_dict = {}
+
+ self._process_element(
+ url,
+ body,
+ media,
+ internal_links_dict,
+ external_links_dict,
+ base_domain=base_domain,
+ **kwargs,
+ )
+
+ # Handle only_text option
+ if kwargs.get("only_text", False):
+ for tag in ONLY_TEXT_ELIGIBLE_TAGS:
+ for element in body.xpath(f".//{tag}"):
+ if element.text:
+ new_text = lhtml.Element("span")
+ new_text.text = element.text_content()
+ if element.getparent() is not None:
+ element.getparent().replace(element, new_text)
+
+ # Clean base64 images
+ for img in body.xpath(".//img[@src]"):
+ src = img.get("src", "")
+ if self.BASE64_PATTERN.match(src):
+ img.set("src", self.BASE64_PATTERN.sub("", src))
+
+ # Remove empty elements
+ self.remove_empty_elements_fast(body, 1)
+
+            # Remove unneeded attributes
+ self.remove_unwanted_attributes_fast(
+ body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
+ )
+
+ # Generate output HTML
+ cleaned_html = lhtml.tostring(
+ body,
+ encoding="unicode",
+ pretty_print=True,
+ method="html",
+ with_tail=False,
+ ).strip()
+ return {
+ "cleaned_html": cleaned_html,
+ "success": success,
+ "media": media,
+ "links": {
+ "internal": list(internal_links_dict.values()),
+ "external": list(external_links_dict.values()),
+ },
+ "metadata": meta,
+ }
+
+ except Exception as e:
+ self._log("error", f"Error processing HTML: {str(e)}", "SCRAPE")
+ # Create error message in case of failure
+ error_body = lhtml.Element("div")
+ # Use etree.SubElement rather than lhtml.SubElement
+ error_div = etree.SubElement(error_body, "div", id="crawl4ai_error_message")
+ error_div.text = f"""
+ Crawl4AI Error: This page is not fully supported.
+
+ Error Message: {str(e)}
+
+ Possible reasons:
+ 1. The page may have restrictions that prevent crawling.
+ 2. The page might not be fully loaded.
+
+ Suggestions:
+ - Try calling the crawl function with these parameters:
+ magic=True,
+ - Set headless=False to visualize what's happening on the page.
+
+ If the issue persists, please check the page's structure and any potential anti-crawling measures.
+ """
+ cleaned_html = lhtml.tostring(
+ error_body, encoding="unicode", pretty_print=True
+ )
+ return {
+ "cleaned_html": cleaned_html,
+ "success": False,
+ "media": {"images": [], "videos": [], "audios": []},
+ "links": {"internal": [], "external": []},
+ "metadata": {},
+ }
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 898dcfa8..34e20ecd 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -15,54 +15,53 @@ import logging, time
import base64
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
-from typing import List, Callable
+from typing import Callable
import requests
import os
from pathlib import Path
from .utils import *
-logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
+logger = logging.getLogger("selenium.webdriver.remote.remote_connection")
logger.setLevel(logging.WARNING)
-logger_driver = logging.getLogger('selenium.webdriver.common.service')
+logger_driver = logging.getLogger("selenium.webdriver.common.service")
logger_driver.setLevel(logging.WARNING)
-urllib3_logger = logging.getLogger('urllib3.connectionpool')
+urllib3_logger = logging.getLogger("urllib3.connectionpool")
urllib3_logger.setLevel(logging.WARNING)
# Disable http.client logging
-http_client_logger = logging.getLogger('http.client')
+http_client_logger = logging.getLogger("http.client")
http_client_logger.setLevel(logging.WARNING)
# Disable driver_finder and service logging
-driver_finder_logger = logging.getLogger('selenium.webdriver.common.driver_finder')
+driver_finder_logger = logging.getLogger("selenium.webdriver.common.driver_finder")
driver_finder_logger.setLevel(logging.WARNING)
-
-
class CrawlerStrategy(ABC):
@abstractmethod
def crawl(self, url: str, **kwargs) -> str:
pass
-
+
@abstractmethod
def take_screenshot(self, save_path: str):
pass
-
+
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
-
+
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
pass
+
class CloudCrawlerStrategy(CrawlerStrategy):
- def __init__(self, use_cached_html = False):
+ def __init__(self, use_cached_html=False):
super().__init__()
self.use_cached_html = use_cached_html
-
+
def crawl(self, url: str) -> str:
data = {
"urls": [url],
@@ -76,6 +75,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
html = response["results"][0]["html"]
return sanitize_input_encode(html)
+
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
super().__init__()
@@ -87,20 +87,25 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
else:
- user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+ user_agent = kwargs.get(
+ "user_agent",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+ )
self.options.add_argument(f"--user-agent={user_agent}")
- self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
-
+ self.options.add_argument(
+ "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ )
+
self.options.headless = kwargs.get("headless", True)
if self.options.headless:
self.options.add_argument("--headless")
-
- self.options.add_argument("--disable-gpu")
+
+ self.options.add_argument("--disable-gpu")
self.options.add_argument("--window-size=1920,1080")
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-dev-shm-usage")
- self.options.add_argument("--disable-blink-features=AutomationControlled")
-
+ self.options.add_argument("--disable-blink-features=AutomationControlled")
+
# self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-gpu")
# self.options.add_argument("--disable-extensions")
@@ -120,14 +125,14 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.use_cached_html = use_cached_html
self.js_code = js_code
self.verbose = kwargs.get("verbose", False)
-
+
# Hooks
self.hooks = {
- 'on_driver_created': None,
- 'on_user_agent_updated': None,
- 'before_get_url': None,
- 'after_get_url': None,
- 'before_return_html': None
+ "on_driver_created": None,
+ "on_user_agent_updated": None,
+ "before_get_url": None,
+ "after_get_url": None,
+ "before_return_html": None,
}
# chromedriver_autoinstaller.install()
@@ -137,31 +142,28 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# chromedriver_path = chromedriver_autoinstaller.install()
# chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
# self.service = Service(chromedriver_autoinstaller.install())
-
-
+
# chromedriver_path = ChromeDriverManager().install()
# self.service = Service(chromedriver_path)
# self.service.log_path = "NUL"
# self.driver = webdriver.Chrome(service=self.service, options=self.options)
-
+
# Use selenium-manager (built into Selenium 4.10.0+)
self.service = Service()
self.driver = webdriver.Chrome(options=self.options)
-
- self.driver = self.execute_hook('on_driver_created', self.driver)
-
+
+ self.driver = self.execute_hook("on_driver_created", self.driver)
+
if kwargs.get("cookies"):
for cookie in kwargs.get("cookies"):
self.driver.add_cookie(cookie)
-
-
def set_hook(self, hook_type: str, hook: Callable):
if hook_type in self.hooks:
self.hooks[hook_type] = hook
else:
raise ValueError(f"Invalid hook type: {hook_type}")
-
+
def execute_hook(self, hook_type: str, *args):
hook = self.hooks.get(hook_type)
if hook:
@@ -170,7 +172,9 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
if isinstance(result, webdriver.Chrome):
return result
else:
- raise TypeError(f"Hook {hook_type} must return an instance of webdriver.Chrome or None.")
+ raise TypeError(
+ f"Hook {hook_type} must return an instance of webdriver.Chrome or None."
+ )
# If the hook returns None or there is no hook, return self.driver
return self.driver
@@ -178,60 +182,77 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options)
- self.driver = self.execute_hook('on_user_agent_updated', self.driver)
+ self.driver = self.execute_hook("on_user_agent_updated", self.driver)
def set_custom_headers(self, headers: dict):
# Enable Network domain for sending headers
- self.driver.execute_cdp_cmd('Network.enable', {})
+ self.driver.execute_cdp_cmd("Network.enable", {})
# Set extra HTTP headers
- self.driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
+ self.driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": headers})
- def _ensure_page_load(self, max_checks=6, check_interval=0.01):
+ def _ensure_page_load(self, max_checks=6, check_interval=0.01):
initial_length = len(self.driver.page_source)
-
+
for ix in range(max_checks):
# print(f"Checking page load: {ix}")
time.sleep(check_interval)
current_length = len(self.driver.page_source)
-
+
if current_length != initial_length:
break
return self.driver.page_source
-
+
def crawl(self, url: str, **kwargs) -> str:
# Create md5 hash of the URL
import hashlib
+
url_hash = hashlib.md5(url.encode()).hexdigest()
-
+
if self.use_cached_html:
- cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
+ cache_file_path = os.path.join(
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
+ ".crawl4ai",
+ "cache",
+ url_hash,
+ )
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return sanitize_input_encode(f.read())
try:
- self.driver = self.execute_hook('before_get_url', self.driver)
+ self.driver = self.execute_hook("before_get_url", self.driver)
if self.verbose:
print(f"[LOG] πΈοΈ Crawling {url} using LocalSeleniumCrawlerStrategy...")
- self.driver.get(url) #
-
+ self.driver.get(url) #
+
WebDriverWait(self.driver, 20).until(
- lambda d: d.execute_script('return document.readyState') == 'complete'
+ lambda d: d.execute_script("return document.readyState") == "complete"
)
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
)
-
- self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-
- self.driver = self.execute_hook('after_get_url', self.driver)
- html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
- can_not_be_done_headless = False # Look at my creativity for naming variables
-
+
+ self.driver.execute_script(
+ "window.scrollTo(0, document.body.scrollHeight);"
+ )
+
+ self.driver = self.execute_hook("after_get_url", self.driver)
+ html = sanitize_input_encode(
+ self._ensure_page_load()
+ ) # self.driver.page_source
+ can_not_be_done_headless = (
+ False # Look at my creativity for naming variables
+ )
+
# TODO: Very ugly approach, but promise to change it!
- if kwargs.get('bypass_headless', False) or html == "":
- print("[LOG] π Page could not be loaded in headless mode. Trying non-headless mode...")
+ if (
+ kwargs.get("bypass_headless", False)
+ or html == ""
+ ):
+ print(
+ "[LOG] π Page could not be loaded in headless mode. Trying non-headless mode..."
+ )
can_not_be_done_headless = True
options = Options()
options.headless = False
@@ -239,27 +260,31 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
options.add_argument("--window-size=5,5")
driver = webdriver.Chrome(service=self.service, options=options)
driver.get(url)
- self.driver = self.execute_hook('after_get_url', driver)
+ self.driver = self.execute_hook("after_get_url", driver)
html = sanitize_input_encode(driver.page_source)
driver.quit()
-
+
# Execute JS code if provided
self.js_code = kwargs.get("js_code", self.js_code)
if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code
WebDriverWait(self.driver, 10).until(
- lambda driver: driver.execute_script("return document.readyState") == "complete"
+ lambda driver: driver.execute_script("return document.readyState")
+ == "complete"
)
elif self.js_code and type(self.js_code) == list:
for js in self.js_code:
self.driver.execute_script(js)
WebDriverWait(self.driver, 10).until(
- lambda driver: driver.execute_script("return document.readyState") == "complete"
+ lambda driver: driver.execute_script(
+ "return document.readyState"
+ )
+ == "complete"
)
-
+
# Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
- wait_for = kwargs.get('wait_for', False)
+ wait_for = kwargs.get("wait_for", False)
if wait_for:
if callable(wait_for):
print("[LOG] π Waiting for condition...")
@@ -268,32 +293,37 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
print("[LOG] π Waiting for condition...")
WebDriverWait(self.driver, 20).until(
EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
- )
-
+ )
+
if not can_not_be_done_headless:
html = sanitize_input_encode(self.driver.page_source)
- self.driver = self.execute_hook('before_return_html', self.driver, html)
-
+ self.driver = self.execute_hook("before_return_html", self.driver, html)
+
# Store in cache
- cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
+ cache_file_path = os.path.join(
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
+ ".crawl4ai",
+ "cache",
+ url_hash,
+ )
with open(cache_file_path, "w", encoding="utf-8") as f:
f.write(html)
-
+
if self.verbose:
print(f"[LOG] β
Crawled {url} successfully!")
-
+
return html
except InvalidArgumentException as e:
- if not hasattr(e, 'msg'):
+ if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
except WebDriverException as e:
# If e does nlt have msg attribute create it and set it to str(e)
- if not hasattr(e, 'msg'):
+ if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
- raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
+ raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
except Exception as e:
- if not hasattr(e, 'msg'):
+ if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
raise Exception(f"Failed to crawl {url}: {e.msg}")
@@ -301,7 +331,9 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
try:
# Get the dimensions of the page
total_width = self.driver.execute_script("return document.body.scrollWidth")
- total_height = self.driver.execute_script("return document.body.scrollHeight")
+ total_height = self.driver.execute_script(
+ "return document.body.scrollHeight"
+ )
# Set the window size to the dimensions of the page
self.driver.set_window_size(total_width, total_height)
@@ -313,25 +345,27 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
image = Image.open(BytesIO(screenshot))
# Convert image to RGB mode (this will handle both RGB and RGBA images)
- rgb_image = image.convert('RGB')
+ rgb_image = image.convert("RGB")
# Convert to JPEG and compress
buffered = BytesIO()
rgb_image.save(buffered, format="JPEG", quality=85)
- img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+ img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
if self.verbose:
- print(f"[LOG] πΈ Screenshot taken and converted to base64")
+ print("[LOG] πΈ Screenshot taken and converted to base64")
return img_base64
except Exception as e:
- error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
+ error_message = sanitize_input_encode(
+ f"Failed to take screenshot: {str(e)}"
+ )
print(error_message)
# Generate an image with black background
- img = Image.new('RGB', (800, 600), color='black')
+ img = Image.new("RGB", (800, 600), color="black")
draw = ImageDraw.Draw(img)
-
+
# Load a font
try:
font = ImageFont.truetype("arial.ttf", 40)
@@ -345,16 +379,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# Calculate text position
text_position = (10, 10)
-
+
# Draw the text on the image
draw.text(text_position, wrapped_text, fill=text_color, font=font)
-
+
# Convert to base64
buffered = BytesIO()
img.save(buffered, format="JPEG")
- img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
+ img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
return img_base64
-
+
def quit(self):
self.driver.quit()
diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index 42ad7017..815b6b05 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -7,11 +7,13 @@ DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".cra
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
+
def init_db():
global DB_PATH
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('''
+ cursor.execute(
+ """
CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY,
html TEXT,
@@ -24,31 +26,42 @@ def init_db():
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
)
- ''')
+ """
+ )
conn.commit()
conn.close()
+
def alter_db_add_screenshot(new_column: str = "media"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+ cursor.execute(
+ f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
+ )
conn.commit()
conn.close()
except Exception as e:
print(f"Error altering database to add screenshot column: {e}")
+
def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
-def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
+
+def get_cached_url(
+ url: str,
+) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,))
+ cursor.execute(
+ "SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?",
+ (url,),
+ )
result = cursor.fetchone()
conn.close()
return result
@@ -56,12 +69,25 @@ def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str
print(f"Error retrieving cached URL: {e}")
return None
-def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""):
+
+def cache_url(
+ url: str,
+ html: str,
+ cleaned_html: str,
+ markdown: str,
+ extracted_content: str,
+ success: bool,
+ media: str = "{}",
+ links: str = "{}",
+ metadata: str = "{}",
+ screenshot: str = "",
+):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('''
+ cursor.execute(
+ """
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
@@ -74,18 +100,32 @@ def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_c
links = excluded.links,
metadata = excluded.metadata,
screenshot = excluded.screenshot
- ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
+ """,
+ (
+ url,
+ html,
+ cleaned_html,
+ markdown,
+ extracted_content,
+ success,
+ media,
+ links,
+ metadata,
+ screenshot,
+ ),
+ )
conn.commit()
conn.close()
except Exception as e:
print(f"Error caching URL: {e}")
+
def get_total_count() -> int:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('SELECT COUNT(*) FROM crawled_data')
+ cursor.execute("SELECT COUNT(*) FROM crawled_data")
result = cursor.fetchone()
conn.close()
return result[0]
@@ -93,43 +133,48 @@ def get_total_count() -> int:
print(f"Error getting total count: {e}")
return 0
+
def clear_db():
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('DELETE FROM crawled_data')
+ cursor.execute("DELETE FROM crawled_data")
conn.commit()
conn.close()
except Exception as e:
print(f"Error clearing database: {e}")
-
+
+
def flush_db():
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute('DROP TABLE crawled_data')
+ cursor.execute("DROP TABLE crawled_data")
conn.commit()
conn.close()
except Exception as e:
print(f"Error flushing database: {e}")
+
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
- cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL')
+ cursor.execute(
+ f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL'
+ )
conn.commit()
conn.close()
except Exception as e:
print(f"Error updating existing records: {e}")
+
if __name__ == "__main__":
# Delete the existing database file
if os.path.exists(DB_PATH):
os.remove(DB_PATH)
- init_db()
+ init_db()
# alter_db_add_screenshot("COL_NAME")
-
diff --git a/crawl4ai/docs_manager.py b/crawl4ai/docs_manager.py
index aacc5812..9a6096a5 100644
--- a/crawl4ai/docs_manager.py
+++ b/crawl4ai/docs_manager.py
@@ -4,6 +4,7 @@ from pathlib import Path
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.llmtxt import AsyncLLMTextManager
+
class DocsManager:
def __init__(self, logger=None):
self.docs_dir = Path.home() / ".crawl4ai" / "docs"
@@ -21,11 +22,14 @@ class DocsManager:
"""Copy from local docs or download from GitHub"""
try:
# Try local first
- if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))):
+ if self.local_docs.exists() and (
+ any(self.local_docs.glob("*.md"))
+ or any(self.local_docs.glob("*.tokens"))
+ ):
# Empty the local docs directory
for file_path in self.docs_dir.glob("*.md"):
file_path.unlink()
- # for file_path in self.docs_dir.glob("*.tokens"):
+ # for file_path in self.docs_dir.glob("*.tokens"):
# file_path.unlink()
for file_path in self.local_docs.glob("*.md"):
shutil.copy2(file_path, self.docs_dir / file_path.name)
@@ -36,14 +40,14 @@ class DocsManager:
# Fallback to GitHub
response = requests.get(
"https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
- headers={'Accept': 'application/vnd.github.v3+json'}
+ headers={"Accept": "application/vnd.github.v3+json"},
)
response.raise_for_status()
-
+
for item in response.json():
- if item['type'] == 'file' and item['name'].endswith('.md'):
- content = requests.get(item['download_url']).text
- with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f:
+ if item["type"] == "file" and item["name"].endswith(".md"):
+ content = requests.get(item["download_url"]).text
+ with open(self.docs_dir / item["name"], "w", encoding="utf-8") as f:
f.write(content)
return True
@@ -57,11 +61,15 @@ class DocsManager:
# Remove [0-9]+_ prefix
names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names]
# Exclude those end with .xs.md and .q.md
- names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")]
+ names = [
+ name
+ for name in names
+ if not name.endswith(".xs") and not name.endswith(".q")
+ ]
return names
-
+
def generate(self, sections, mode="extended"):
return self.llm_text.generate(sections, mode)
-
+
def search(self, query: str, top_k: int = 5):
- return self.llm_text.search(query, top_k)
\ No newline at end of file
+ return self.llm_text.search(query, top_k)
diff --git a/crawl4ai/extraction_strategy.bak.py b/crawl4ai/extraction_strategy.bak.py
deleted file mode 100644
index 2048c0ff..00000000
--- a/crawl4ai/extraction_strategy.bak.py
+++ /dev/null
@@ -1,1440 +0,0 @@
-from abc import ABC, abstractmethod
-from typing import Any, List, Dict, Optional, Union
-from concurrent.futures import ThreadPoolExecutor, as_completed
-import json, time
-# from optimum.intel import IPEXModel
-from .prompts import *
-from .config import *
-from .utils import *
-from .models import *
-from functools import partial
-from .model_loader import *
-import math
-import numpy as np
-import re
-from bs4 import BeautifulSoup
-from lxml import html, etree
-from dataclasses import dataclass
-
-class ExtractionStrategy(ABC):
- """
- Abstract base class for all extraction strategies.
- """
-
- def __init__(self, input_format: str = "markdown", **kwargs):
- """
- Initialize the extraction strategy.
-
- Args:
- input_format: Content format to use for extraction.
- Options: "markdown" (default), "html", "fit_markdown"
- **kwargs: Additional keyword arguments
- """
- self.input_format = input_format
- self.DEL = "<|DEL|>"
- self.name = self.__class__.__name__
- self.verbose = kwargs.get("verbose", False)
-
- @abstractmethod
- def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
- """
- Extract meaningful blocks or chunks from the given HTML.
-
- :param url: The URL of the webpage.
- :param html: The HTML content of the webpage.
- :return: A list of extracted blocks or chunks.
- """
- pass
-
- def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
- """
- Process sections of text in parallel by default.
-
- :param url: The URL of the webpage.
- :param sections: List of sections (strings) to process.
- :return: A list of processed JSON blocks.
- """
- extracted_content = []
- with ThreadPoolExecutor() as executor:
- futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
- for future in as_completed(futures):
- extracted_content.extend(future.result())
- return extracted_content
-
-class NoExtractionStrategy(ExtractionStrategy):
- """
- A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
- """
- def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
- """
- Extract meaningful blocks or chunks from the given HTML.
- """
- return [{"index": 0, "content": html}]
-
- def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
- return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
-
-#######################################################
-# Strategies using LLM-based extraction for text data #
-#######################################################
-class LLMExtractionStrategy(ExtractionStrategy):
- """
- A strategy that uses an LLM to extract meaningful content from the HTML.
-
- Attributes:
- provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3".
- api_token: The API token for the provider.
- instruction: The instruction to use for the LLM model.
- schema: Pydantic model schema for structured data.
- extraction_type: "block" or "schema".
- chunk_token_threshold: Maximum tokens per chunk.
- overlap_rate: Overlap between chunks.
- word_token_rate: Word to token conversion rate.
- apply_chunking: Whether to apply chunking.
- base_url: The base URL for the API request.
- api_base: The base URL for the API request.
- extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
- verbose: Whether to print verbose output.
- usages: List of individual token usages.
- total_usage: Accumulated token usage.
- """
-
- def __init__(self,
- provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
- instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
- """
- Initialize the strategy with clustering parameters.
-
- Args:
- provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3".
- api_token: The API token for the provider.
- instruction: The instruction to use for the LLM model.
- schema: Pydantic model schema for structured data.
- extraction_type: "block" or "schema".
- chunk_token_threshold: Maximum tokens per chunk.
- overlap_rate: Overlap between chunks.
- word_token_rate: Word to token conversion rate.
- apply_chunking: Whether to apply chunking.
- base_url: The base URL for the API request.
- api_base: The base URL for the API request.
- extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
- verbose: Whether to print verbose output.
- usages: List of individual token usages.
- total_usage: Accumulated token usage.
-
- """
- super().__init__(**kwargs)
- self.provider = provider
- self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
- self.instruction = instruction
- self.extract_type = extraction_type
- self.schema = schema
- if schema:
- self.extract_type = "schema"
-
- self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
- self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
- self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
- self.apply_chunking = kwargs.get("apply_chunking", True)
- self.base_url = kwargs.get("base_url", None)
- self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
- self.extra_args = kwargs.get("extra_args", {})
- if not self.apply_chunking:
- self.chunk_token_threshold = 1e9
-
- self.verbose = kwargs.get("verbose", False)
- self.usages = [] # Store individual usages
- self.total_usage = TokenUsage() # Accumulated usage
-
- if not self.api_token:
- raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
-
-
- def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
- """
- Extract meaningful blocks or chunks from the given HTML using an LLM.
-
- How it works:
- 1. Construct a prompt with variables.
- 2. Make a request to the LLM using the prompt.
- 3. Parse the response and extract blocks or chunks.
-
- Args:
- url: The URL of the webpage.
- ix: Index of the block.
- html: The HTML content of the webpage.
-
- Returns:
- A list of extracted blocks or chunks.
- """
- if self.verbose:
- # print("[LOG] Extracting blocks from URL:", url)
- print(f"[LOG] Call LLM for {url} - block index: {ix}")
-
- variable_values = {
- "URL": url,
- "HTML": escape_json_string(sanitize_html(html)),
- }
-
- prompt_with_variables = PROMPT_EXTRACT_BLOCKS
- if self.instruction:
- variable_values["REQUEST"] = self.instruction
- prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
-
- if self.extract_type == "schema" and self.schema:
- variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
- prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
-
- for variable in variable_values:
- prompt_with_variables = prompt_with_variables.replace(
- "{" + variable + "}", variable_values[variable]
- )
-
- response = perform_completion_with_backoff(
- self.provider,
- prompt_with_variables,
- self.api_token,
- base_url=self.api_base or self.base_url,
- extra_args = self.extra_args
- ) # , json_response=self.extract_type == "schema")
- # Track usage
- usage = TokenUsage(
- completion_tokens=response.usage.completion_tokens,
- prompt_tokens=response.usage.prompt_tokens,
- total_tokens=response.usage.total_tokens,
- completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
- prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
- )
- self.usages.append(usage)
-
- # Update totals
- self.total_usage.completion_tokens += usage.completion_tokens
- self.total_usage.prompt_tokens += usage.prompt_tokens
- self.total_usage.total_tokens += usage.total_tokens
-
- try:
- blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
- blocks = json.loads(blocks)
- for block in blocks:
- block['error'] = False
- except Exception as e:
- parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
- blocks = parsed
- if unparsed:
- blocks.append({
- "index": 0,
- "error": True,
- "tags": ["error"],
- "content": unparsed
- })
-
- if self.verbose:
- print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
- return blocks
-
- def _merge(self, documents, chunk_token_threshold, overlap):
- """
- Merge documents into sections based on chunk_token_threshold and overlap.
- """
- chunks = []
- sections = []
- total_tokens = 0
-
- # Calculate the total tokens across all documents
- for document in documents:
- total_tokens += len(document.split(' ')) * self.word_token_rate
-
- # Calculate the number of sections needed
- num_sections = math.floor(total_tokens / chunk_token_threshold)
- if num_sections < 1:
- num_sections = 1 # Ensure there is at least one section
- adjusted_chunk_threshold = total_tokens / num_sections
-
- total_token_so_far = 0
- current_chunk = []
-
- for document in documents:
- tokens = document.split(' ')
- token_count = len(tokens) * self.word_token_rate
-
- if total_token_so_far + token_count <= adjusted_chunk_threshold:
- current_chunk.extend(tokens)
- total_token_so_far += token_count
- else:
- # Ensure to handle the last section properly
- if len(sections) == num_sections - 1:
- current_chunk.extend(tokens)
- continue
-
- # Add overlap if specified
- if overlap > 0 and current_chunk:
- overlap_tokens = current_chunk[-overlap:]
- current_chunk.extend(overlap_tokens)
-
- sections.append(' '.join(current_chunk))
- current_chunk = tokens
- total_token_so_far = token_count
-
- # Add the last chunk
- if current_chunk:
- sections.append(' '.join(current_chunk))
-
- return sections
-
-
- def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
- """
- Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
-
- Args:
- url: The URL of the webpage.
- sections: List of sections (strings) to process.
-
- Returns:
- A list of extracted blocks or chunks.
- """
-
- merged_sections = self._merge(
- sections, self.chunk_token_threshold,
- overlap= int(self.chunk_token_threshold * self.overlap_rate)
- )
- extracted_content = []
- if self.provider.startswith("groq/"):
- # Sequential processing with a delay
- for ix, section in enumerate(merged_sections):
- extract_func = partial(self.extract, url)
- extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
- time.sleep(0.5) # 500 ms delay between each processing
- else:
- # Parallel processing using ThreadPoolExecutor
- # extract_func = partial(self.extract, url)
- # for ix, section in enumerate(merged_sections):
- # extracted_content.append(extract_func(ix, section))
-
- with ThreadPoolExecutor(max_workers=4) as executor:
- extract_func = partial(self.extract, url)
- futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
-
- for future in as_completed(futures):
- try:
- extracted_content.extend(future.result())
- except Exception as e:
- if self.verbose:
- print(f"Error in thread execution: {e}")
- # Add error information to extracted_content
- extracted_content.append({
- "index": 0,
- "error": True,
- "tags": ["error"],
- "content": str(e)
- })
-
-
- return extracted_content
-
-
- def show_usage(self) -> None:
- """Print a detailed token usage report showing total and per-request usage."""
- print("\n=== Token Usage Summary ===")
- print(f"{'Type':<15} {'Count':>12}")
- print("-" * 30)
- print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
- print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
- print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
-
- print("\n=== Usage History ===")
- print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
- print("-" * 48)
- for i, usage in enumerate(self.usages, 1):
- print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
-
-#######################################################
-# Strategies using clustering for text data extraction #
-#######################################################
-
class CosineStrategy(ExtractionStrategy):
    """
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.

    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.

    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    """
    def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
        """
        Initialize the strategy with clustering parameters.

        Args:
            semantic_filter (str): A keyword filter for document filtering.
            word_count_threshold (int): Minimum number of words per cluster.
            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
            linkage_method (str): The linkage method for hierarchical clustering.
            top_k (int): Number of top categories to extract.
            model_name (str): Hugging Face model id used for sentence embeddings.
            sim_threshold (float): Minimum cosine similarity for the semantic pre-filter.
        """
        super().__init__(**kwargs)

        # NOTE(review): numpy is imported locally here, yet `np` is also
        # referenced in `get_embeddings` below; presumably numpy is in scope
        # at module level as well -- confirm against the file's imports.
        import numpy as np

        self.semantic_filter = semantic_filter
        self.word_count_threshold = word_count_threshold
        self.max_dist = max_dist
        self.linkage_method = linkage_method
        self.top_k = top_k
        self.sim_threshold = sim_threshold
        self.timer = time.time()  # wall-clock start used for load-time logging
        self.verbose = kwargs.get("verbose", False)

        self.buffer_embeddings = np.array([])  # cache filled by get_embeddings()
        self.get_embedding_method = "direct"

        self.device = get_device()
        # import torch
        # self.device = torch.device('cpu')

        self.default_batch_size = calculate_batch_size(self.device)

        if self.verbose:
            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

        # if False and self.device.type == "cpu":
        #     self.model = load_onnx_all_MiniLM_l6_v2()
        #     self.tokenizer = self.model.tokenizer
        #     self.get_embedding_method = "direct"
        # else:

        # Load the HF embedding model and move it to the selected device in
        # inference mode.
        self.tokenizer, self.model = load_HF_embedding_model(model_name)
        self.model.to(self.device)
        self.model.eval()

        self.get_embedding_method = "batch"

        self.buffer_embeddings = np.array([])

        # if model_name == "bert-base-uncased":
        #     self.tokenizer, self.model = load_bert_base_uncased()
        #     self.model.eval()  # Ensure the model is in evaluation mode
        #     self.get_embedding_method = "batch"
        # elif model_name == "BAAI/bge-small-en-v1.5":
        #     self.tokenizer, self.model = load_bge_small_en_v1_5()
        #     self.model.eval()  # Ensure the model is in evaluation mode
        #     self.get_embedding_method = "batch"
        # elif model_name == "sentence-transformers/all-MiniLM-L6-v2":
        #     self.model = load_onnx_all_MiniLM_l6_v2()
        #     self.tokenizer = self.model.tokenizer
        #     self.get_embedding_method = "direct"


        if self.verbose:
            print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")

        # Multilabel classifier used in extract() to tag each cluster.
        self.nlp, _ = load_text_multilabel_classifier()
        # self.default_batch_size = 16 if self.device.type == 'cpu' else 64

        if self.verbose:
            print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")

    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
        """
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts.
        """

        if not semantic_filter:
            return documents

        # With few documents, cap the guaranteed count at half the corpus.
        if len(documents) < at_least_k:
            at_least_k = len(documents) // 2

        from sklearn.metrics.pairwise import cosine_similarity

        # Compute embedding for the keyword filter
        query_embedding = self.get_embeddings([semantic_filter])[0]

        # Compute embeddings for the documents
        document_embeddings = self.get_embeddings(documents)

        # Calculate cosine similarity between the query embedding and document embeddings
        similarities = cosine_similarity([query_embedding], document_embeddings).flatten()

        # Filter documents based on the similarity threshold
        filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]

        # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
        # and top up so the caller always gets at least `at_least_k` docs.
        if len(filtered_docs) < at_least_k:
            remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
            remaining_docs.sort(key=lambda x: x[1], reverse=True)
            filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])

        # Extract the document texts from the tuples
        filtered_docs = [doc for doc, _ in filtered_docs]

        return filtered_docs[:at_least_k]

    def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
        """
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).
            batch_size (int, optional): Per-batch sentence count; defaults to
                the device-derived ``self.default_batch_size``.
            bypass_buffer (bool): Present for buffer control; the buffer
                short-circuit below is currently commented out.

        Returns:
            NumPy array of embeddings (one row per sentence).
        """
        # if self.buffer_embeddings.any() and not bypass_buffer:
        #     return self.buffer_embeddings

        if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
            import torch
            # Tokenize sentences and convert to tensor
            if batch_size is None:
                batch_size = self.default_batch_size

            all_embeddings = []
            for i in range(0, len(sentences), batch_size):
                batch_sentences = sentences[i:i + batch_size]
                encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
                encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}

                # Ensure no gradients are calculated
                with torch.no_grad():
                    model_output = self.model(**encoded_input)

                # Get embeddings from the last hidden state (mean pooling)
                embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
                all_embeddings.append(embeddings)

            self.buffer_embeddings = np.vstack(all_embeddings)
        # NOTE(review): this branch is unreachable -- "cpu" is already matched
        # by the condition above; dead code kept as-is.
        elif self.device.type == "cpu":
            # self.buffer_embeddings = self.model(sentences)
            if batch_size is None:
                batch_size = self.default_batch_size

            all_embeddings = []
            for i in range(0, len(sentences), batch_size):
                batch_sentences = sentences[i:i + batch_size]
                embeddings = self.model(batch_sentences)
                all_embeddings.append(embeddings)

            self.buffer_embeddings = np.vstack(all_embeddings)
        return self.buffer_embeddings

    def hierarchical_clustering(self, sentences: List[str], embeddings = None):
        """
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).
            embeddings: Unused; embeddings are always recomputed below.

        Returns:
            NumPy array of cluster labels (1-based, from scipy's fcluster).
        """
        # Get embeddings
        from scipy.cluster.hierarchy import linkage, fcluster
        from scipy.spatial.distance import pdist
        self.timer = time.time()
        embeddings = self.get_embeddings(sentences, bypass_buffer=True)
        # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds")
        # Compute pairwise cosine distances
        distance_matrix = pdist(embeddings, 'cosine')
        # Perform agglomerative clustering respecting order
        linked = linkage(distance_matrix, method=self.linkage_method)
        # Form flat clusters
        labels = fcluster(linked, self.max_dist, criterion='distance')
        return labels

    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
        """
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
        """
        filtered_clusters = {}
        for cluster_id, texts in clusters.items():
            # Concatenate texts for analysis
            full_text = " ".join(texts)
            # Count words
            word_count = len(full_text.split())

            # Keep clusters with word count above the threshold
            if word_count >= self.word_count_threshold:
                filtered_clusters[cluster_id] = texts

        return filtered_clusters

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage, pre-split into text
                chunks joined by ``self.DEL``.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks, each with
            ``index``, ``tags`` and ``content`` keys.
        """
        # Assume `html` is a list of text chunks for this strategy
        t = time.time()
        text_chunks = html.split(self.DEL)  # Split by lines or paragraphs as needed

        # Pre-filter documents using embeddings and semantic_filter
        text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)

        if not text_chunks:
            return []

        # Perform clustering
        labels = self.hierarchical_clustering(text_chunks)
        # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")

        # Organize texts by their cluster labels, retaining order
        t = time.time()
        clusters = {}
        for index, label in enumerate(labels):
            clusters.setdefault(label, []).append(text_chunks[index])

        # Filter clusters by word count
        filtered_clusters = self.filter_clusters_by_word_count(clusters)

        # Convert filtered clusters to a sorted list of dictionaries
        cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]

        if self.verbose:
            print(f"[LOG] 🚀 Assign tags using {self.device}")

        # Tag every cluster with the multilabel classifier loaded in __init__.
        if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
            labels = self.nlp([cluster['content'] for cluster in cluster_list])

            for cluster, label in zip(cluster_list, labels):
                cluster['tags'] = label
        # elif self.device.type == "cpu":
        #     # Process the text with the loaded model
        #     texts = [cluster['content'] for cluster in cluster_list]
        #     # Batch process texts
        #     docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])

        #     for doc, cluster in zip(docs, cluster_list):
        #         tok_k = self.top_k
        #         top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
        #         cluster['tags'] = [cat for cat, _ in top_categories]

        # for cluster in cluster_list:
        #     doc = self.nlp(cluster['content'])
        #     tok_k = self.top_k
        #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
        #     cluster['tags'] = [cat for cat, _ in top_categories]

        if self.verbose:
            print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

        return cluster_list

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
            List[Dict[str, Any]]: Clustered and tagged content blocks, as
            produced by :meth:`extract`.
        """
        # This strategy processes all sections together

        return self.extract(url, self.DEL.join(sections), **kwargs)
-
-#######################################################
-# New extraction strategies for JSON-based extraction #
-#######################################################
-
class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base class for extracting structured JSON from HTML content.

    How it works:
    1. Parses HTML content using the `_parse_html` method.
    2. Uses a schema to define base selectors, fields, and transformations.
    3. Extracts data hierarchically, supporting nested fields and lists.
    4. Handles computed fields with expressions or functions.

    Attributes:
        DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
        _extract_item(element, fields): Extracts fields from a single element.
        _extract_single_field(element, field): Extracts a single field based on its type.
        _apply_transform(value, transform): Applies a transformation to a value.
        _compute_field(item, field): Computes a field value using an expression or function.
        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.

    Abstract Methods:
        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
        _get_elements(element, selector): Retrieves child elements using a selector.
        _get_element_text(element): Extracts text content from an element.
        _get_element_html(element): Extracts raw HTML from an element.
        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
    """


    # Delimiter used by run() to join multiple HTML sections into one document.
    DEL = '\n'

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Initialize the JSON element extraction strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
        """
        super().__init__(**kwargs)
        self.schema = schema
        self.verbose = kwargs.get('verbose', False)

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract structured data from HTML content.

        How it works:
        1. Parses the HTML content using the `_parse_html` method.
        2. Identifies base elements using the schema's base selector.
        3. Extracts fields from each base element using `_extract_item`.

        Args:
            url (str): The URL of the page being processed.
            html_content (str): The raw HTML content to parse and extract.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
        """

        parsed_html = self._parse_html(html_content)
        base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            # Only keep elements that yielded at least one field.
            if item:
                results.append(item)

        return results

    @abstractmethod
    def _parse_html(self, html_content: str):
        """Parse HTML content into appropriate format"""
        pass

    @abstractmethod
    def _get_base_elements(self, parsed_html, selector: str):
        """Get all base elements using the selector"""
        pass

    @abstractmethod
    def _get_elements(self, element, selector: str):
        """Get child elements using the selector"""
        pass

    def _extract_field(self, element, field):
        """
        Dispatch extraction for one field by its schema `type`.

        `nested` yields a single dict, `list`/`nested_list` yield a list of
        dicts, and anything else falls through to `_extract_single_field`.
        On any exception the field's `default` (or None) is returned.
        """
        try:
            if field['type'] == 'nested':
                nested_elements = self._get_elements(element, field['selector'])
                nested_element = nested_elements[0] if nested_elements else None
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = self._get_elements(element, field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = self._get_elements(element, field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_single_field(self, element, field):
        """
        Extract a single field based on its type.

        How it works:
        1. Selects the target element using the field's selector.
        2. Extracts the field value based on its type (e.g., text, attribute, regex).
        3. Applies transformations if defined in the schema.

        Args:
            element: The base element to extract the field from.
            field (Dict[str, Any]): The field definition in the schema.

        Returns:
            Any: The extracted field value, or the field's `default` when
            nothing matched.
        """

        if 'selector' in field:
            selected = self._get_elements(element, field['selector'])
            if not selected:
                return field.get('default')
            # _get_elements may return multiple matches; single fields use the first.
            selected = selected[0]
        else:
            # No selector: operate on the element itself.
            selected = element

        value = None
        if field['type'] == 'text':
            value = self._get_element_text(selected)
        elif field['type'] == 'attribute':
            value = self._get_element_attribute(selected, field['attribute'])
        elif field['type'] == 'html':
            value = self._get_element_html(selected)
        elif field['type'] == 'regex':
            text = self._get_element_text(selected)
            match = re.search(field['pattern'], text)
            # Only the first capture group is kept.
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_list_item(self, element, fields):
        """Extract the given scalar fields from one list element, skipping Nones."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_item(self, element, fields):
        """
        Extracts fields from a given element.

        How it works:
        1. Iterates through the fields defined in the schema.
        2. Handles computed, single, and nested field types.
        3. Updates the item dictionary with extracted field values.

        Args:
            element: The base element to extract fields from.
            fields (List[Dict[str, Any]]): The list of fields to extract.

        Returns:
            Dict[str, Any]: A dictionary representing the extracted item.
        """

        item = {}
        for field in fields:
            if field['type'] == 'computed':
                # Computed fields see only the values extracted so far,
                # so schema order matters.
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """
        Apply a transformation to a value.

        How it works:
        1. Checks the transformation type (e.g., `lowercase`, `strip`).
        2. Applies the transformation to the value.
        3. Returns the transformed value.

        Args:
            value (str): The value to transform.
            transform (str): The type of transformation to apply.

        Returns:
            str: The transformed value (unknown transforms return the value unchanged).
        """

        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Compute a field from already-extracted values via an `expression`
        string or a `function` callable defined in the schema.

        SECURITY: `eval` executes schema-provided expressions with the item
        dict as its namespace -- schemas must come from trusted sources only.
        Falls back to the field's `default` on any error.
        """
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
        return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Run the extraction strategy on a combined HTML content.

        How it works:
        1. Combines multiple HTML sections using the `DEL` delimiter.
        2. Calls the `extract` method with the combined HTML.

        Args:
            url (str): The URL of the page being processed.
            sections (List[str]): A list of HTML sections.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items.
        """

        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)

    @abstractmethod
    def _get_element_text(self, element) -> str:
        """Get text content from element"""
        pass

    @abstractmethod
    def _get_element_html(self, element) -> str:
        """Get HTML content from element"""
        pass

    @abstractmethod
    def _get_element_attribute(self, element, attribute: str):
        """Get attribute value from element"""
        pass
-
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)

    def _parse_html(self, html_content: str):
        """Parse raw HTML into a BeautifulSoup tree."""
        return BeautifulSoup(html_content, 'html.parser')

    def _get_base_elements(self, parsed_html, selector: str):
        """Return all elements matching the schema's base CSS selector."""
        return parsed_html.select(selector)

    def _get_elements(self, element, selector: str):
        """
        Return ALL child elements matching the CSS selector.

        Bug fix: the previous implementation used `select_one`, which returns
        at most one element and silently truncated 'list' and 'nested_list'
        fields to a single item. `select` returns every match; the base class
        (`_extract_single_field`) already takes `selected[0]` when a single
        element is needed, so single-field behavior is unchanged.
        """
        return element.select(selector)

    def _get_element_text(self, element) -> str:
        """Return the element's text with surrounding whitespace stripped."""
        return element.get_text(strip=True)

    def _get_element_html(self, element) -> str:
        """Return the element's raw outer HTML."""
        return str(element)

    def _get_element_attribute(self, element, attribute: str):
        """Return the attribute value, or None when absent."""
        return element.get(attribute)
-
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    """
    XPath-based implementation of `JsonElementExtractionStrategy`.

    The HTML is parsed into an lxml tree, base and child elements are located
    with XPath expressions, and simple CSS selectors from the schema are
    translated to XPath on the fly.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)

    def _parse_html(self, html_content: str):
        """Build an lxml element tree from raw HTML."""
        return html.fromstring(html_content)

    def _get_base_elements(self, parsed_html, selector: str):
        """Evaluate the base XPath expression against the document root."""
        return parsed_html.xpath(selector)

    def _css_to_xpath(self, css_selector: str) -> str:
        """Pass XPath expressions through untouched; convert simple CSS otherwise."""
        # A '/' anywhere in the selector is taken as evidence it is already XPath.
        return css_selector if '/' in css_selector else self._basic_css_to_xpath(css_selector)

    def _basic_css_to_xpath(self, css_selector: str) -> str:
        """Translate the common CSS combinators (child, descendant) to XPath."""
        if ' > ' in css_selector:
            # Child combinator: a > b  ->  //a/b
            return '//' + '/'.join(css_selector.split(' > '))
        if ' ' in css_selector:
            # Descendant combinator: a b  ->  //a//b
            return '//' + '//'.join(css_selector.split(' '))
        return '//' + css_selector

    def _get_elements(self, element, selector: str):
        """Evaluate the (possibly converted) selector relative to *element*."""
        xpath = self._css_to_xpath(selector)
        # Anchor at the current element rather than the document root.
        relative = xpath if xpath.startswith('.') else '.' + xpath
        return element.xpath(relative)

    def _get_element_text(self, element) -> str:
        """Concatenate all descendant text nodes, trimmed at both ends."""
        fragments = element.xpath('.//text()')
        return ''.join(fragments).strip()

    def _get_element_html(self, element) -> str:
        """Serialize the element back to an HTML string."""
        return etree.tostring(element, encoding='unicode')

    def _get_element_attribute(self, element, attribute: str):
        """Return the attribute value, or None when absent."""
        return element.get(attribute)
-
-
-#######################################################
# Strategies based on the extraction of specific types #
-#######################################################
-
class TopicExtractionStrategy(ExtractionStrategy):
    def __init__(self, num_keywords: int = 3, **kwargs):
        """
        Initialize the topic extraction strategy.

        :param num_keywords: Number of keywords to represent each topic segment.
        """
        import nltk
        super().__init__(**kwargs)
        self.num_keywords = num_keywords
        # TextTiling tokenizer; note extract() currently segments by self.DEL
        # instead of running this tokenizer.
        self.tokenizer = nltk.TextTilingTokenizer()

    def extract_keywords(self, text: str) -> List[str]:
        """
        Return the most frequent tokens of *text* as its keywords.

        :param text: The text segment from which to extract keywords.
        :return: Up to ``num_keywords`` keyword strings, most frequent first.
        """
        import nltk
        tokens = nltk.word_tokenize(text)
        frequency = nltk.FreqDist(tokens)
        return [token for token, _ in frequency.most_common(self.num_keywords)]

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Split the content into segments and attach keywords to each.

        :param url: The URL of the webpage.
        :param html: The content, with segments joined by ``self.DEL``.
        :return: One dict per segment with index, content and keywords.
        """
        segments = html.split(self.DEL)  # Split by lines or paragraphs as needed
        return [
            {
                "index": position,
                "content": segment,
                "keywords": self.extract_keywords(segment)
            }
            for position, segment in enumerate(segments)
        ]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Join all sections and run topic extraction over the combined text.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: A list of processed JSON blocks.
        """
        return self.extract(url, self.DEL.join(sections), **kwargs)
-
class ContentSummarizationStrategy(ExtractionStrategy):
    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Initialize the content summarization strategy with a specific model.

        :param model_name: The model to use for summarization.
        """
        super().__init__(**kwargs)
        # The transformers pipeline downloads/loads the model eagerly here.
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, str]:
        """
        Summarize a single section of text.

        Note: unlike other strategies, this returns a single dict (not a
        list); run() aggregates the per-section dicts into a list.

        :param url: The URL of the webpage.
        :param text: A section of text to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A dictionary with the summary.
        """
        try:
            # max/min_length are in model tokens; do_sample=False keeps output deterministic.
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            print(f"Error summarizing text: {e}")
            return {"summary": text}  # Fallback to original text if summarization fails

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Process each section in parallel to produce summaries.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of dictionaries with summaries for each section.
        """
        # Use a ThreadPoolExecutor to summarize in parallel
        summaries = []
        with ThreadPoolExecutor() as executor:
            # Create a future for each section's summarization
            future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
            for future in as_completed(future_to_section):
                section_index = future_to_section[future]
                try:
                    summary_result = future.result()
                    summaries.append((section_index, summary_result))
                except Exception as e:
                    print(f"Error processing section {section_index}: {e}")
                    summaries.append((section_index, {"summary": sections[section_index]}))  # Fallback to original text

        # Sort summaries by the original section index to maintain order
        # (as_completed yields in completion order, not submission order).
        summaries.sort(key=lambda x: x[0])
        return [summary for _, summary in summaries]
-
-#######################################################
-# Deprecated strategies
-#######################################################
-
class _JsonCssExtractionStrategy(ExtractionStrategy):
    """
    DEPRECATED: superseded by `JsonCssExtractionStrategy`, which shares the
    same schema format but is built on the `JsonElementExtractionStrategy`
    base class. Kept for backward compatibility only.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        :param schema: Extraction rules (baseSelector, baseFields, fields).
        """
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Extract one item dict per element matching the schema's baseSelector."""
        soup = BeautifulSoup(html, 'html.parser')
        base_elements = soup.select(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            # NOTE(review): unlike the replacement class, empty items are kept.
            results.append(item)

        return results

    def _extract_field(self, element, field):
        """Dispatch extraction by field type; return the field default on error."""
        try:
            if field['type'] == 'nested':
                nested_element = element.select_one(field['selector'])
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = element.select(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = element.select(field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            # NOTE(review): `self.verbose` is presumably set by the
            # ExtractionStrategy base class -- confirm.
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Extract the given scalar fields from one list element, skipping Nones."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        """Extract one scalar field (text/attribute/html/regex) with optional transform."""
        if 'selector' in field:
            selected = element.select_one(field['selector'])
            if not selected:
                return field.get('default')
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = selected.get_text(strip=True)
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = str(selected)
        elif field['type'] == 'regex':
            text = selected.get_text(strip=True)
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """Build an item dict from the schema fields; computed fields see prior values."""
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """Apply a named string transform; unknown names return the value unchanged."""
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """Compute a field from prior values.

        SECURITY: `eval` runs schema-provided expressions; only use trusted schemas.
        """
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
        return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join sections with DEL and extract from the combined HTML."""
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
-class _JsonXPathExtractionStrategy(ExtractionStrategy):
- def __init__(self, schema: Dict[str, Any], **kwargs):
- kwargs['input_format'] = 'html' # Force HTML input
- super().__init__(**kwargs)
- self.schema = schema
-
- def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
- tree = html.fromstring(html_content)
- base_xpath = self.schema['baseSelector']
- base_elements = tree.xpath(base_xpath)
-
- results = []
- for element in base_elements:
- # Extract base element attributes first
- item = {}
- if 'baseFields' in self.schema:
- for field in self.schema['baseFields']:
- value = self._extract_single_field(element, field)
- if value is not None:
- item[field['name']] = value
-
- # Then extract child fields
- field_data = self._extract_item(element, self.schema['fields'])
- item.update(field_data)
-
- results.append(item)
-
- return results
-
- def _css_to_xpath(self, css_selector: str) -> str:
- """Convert CSS selector to XPath if needed"""
- if '/' in css_selector: # Already an XPath
- return css_selector
- else:
- # Fallback to basic conversion for common cases
- return self._basic_css_to_xpath(css_selector)
-
- def _basic_css_to_xpath(self, css_selector: str) -> str:
- """Basic CSS to XPath conversion for common cases"""
- # Handle basic cases
- if ' > ' in css_selector:
- parts = css_selector.split(' > ')
- return '//' + '/'.join(parts)
- if ' ' in css_selector:
- parts = css_selector.split(' ')
- return '//' + '//'.join(parts)
- return '//' + css_selector
-
- def _extract_field(self, element, field):
- try:
- if field['type'] == 'nested':
- xpath = self._css_to_xpath(field['selector'])
- nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
- return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}
-
- if field['type'] == 'list':
- xpath = self._css_to_xpath(field['selector'])
- elements = element.xpath(xpath)
- return [self._extract_list_item(el, field['fields']) for el in elements]
-
- if field['type'] == 'nested_list':
- xpath = self._css_to_xpath(field['selector'])
- elements = element.xpath(xpath)
- return [self._extract_item(el, field['fields']) for el in elements]
-
- return self._extract_single_field(element, field)
- except Exception as e:
- if self.verbose:
- print(f"Error extracting field {field['name']}: {str(e)}")
- return field.get('default')
-
- def _extract_list_item(self, element, fields):
- item = {}
- for field in fields:
- value = self._extract_single_field(element, field)
- if value is not None:
- item[field['name']] = value
- return item
-
- def _extract_single_field(self, element, field):
- if 'selector' in field:
- xpath = self._css_to_xpath(field['selector'])
- selected = element.xpath(xpath)
- if not selected:
- return field.get('default')
- selected = selected[0]
- else:
- selected = element
-
- value = None
- if field['type'] == 'text':
- value = ''.join(selected.xpath('.//text()')).strip()
- elif field['type'] == 'attribute':
- value = selected.get(field['attribute'])
- elif field['type'] == 'html':
- value = etree.tostring(selected, encoding='unicode')
- elif field['type'] == 'regex':
- text = ''.join(selected.xpath('.//text()')).strip()
- match = re.search(field['pattern'], text)
- value = match.group(1) if match else None
-
- if 'transform' in field:
- value = self._apply_transform(value, field['transform'])
-
- return value if value is not None else field.get('default')
-
- def _extract_item(self, element, fields):
- item = {}
- for field in fields:
- if field['type'] == 'computed':
- value = self._compute_field(item, field)
- else:
- value = self._extract_field(element, field)
- if value is not None:
- item[field['name']] = value
- return item
-
- def _apply_transform(self, value, transform):
- if transform == 'lowercase':
- return value.lower()
- elif transform == 'uppercase':
- return value.upper()
- elif transform == 'strip':
- return value.strip()
- return value
-
- def _compute_field(self, item, field):
- try:
- if 'expression' in field:
- return eval(field['expression'], {}, item)
- elif 'function' in field:
- return field['function'](item)
- except Exception as e:
- if self.verbose:
- print(f"Error computing field {field['name']}: {str(e)}")
- return field.get('default')
-
- def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
- combined_html = self.DEL.join(sections)
- return self.extract(url, combined_html, **kwargs)
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 3e688f13..b2b24751 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1,26 +1,52 @@
from abc import ABC, abstractmethod
-from typing import Any, List, Dict, Optional, Union
+from typing import Any, List, Dict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
-import json, time
-# from optimum.intel import IPEXModel
-from .prompts import *
-from .config import *
-from .utils import *
-from .models import *
+import json
+import time
+import os
+
+from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
+from .config import (
+ DEFAULT_PROVIDER, PROVIDER_MODELS,
+ CHUNK_TOKEN_THRESHOLD,
+ OVERLAP_RATE,
+ WORD_TOKEN_RATE,
+)
+from .utils import * # noqa: F403
+
+from .utils import (
+ sanitize_html,
+ escape_json_string,
+ perform_completion_with_backoff,
+ extract_xml_data,
+ split_and_parse_json_objects,
+ sanitize_input_encode,
+)
+from .models import * # noqa: F403
+
+from .models import TokenUsage
+
+from .model_loader import * # noqa: F403
+from .model_loader import (
+ get_device,
+ load_HF_embedding_model,
+ load_text_multilabel_classifier,
+ calculate_batch_size
+)
+
from functools import partial
-from .model_loader import *
import math
import numpy as np
import re
from bs4 import BeautifulSoup
from lxml import html, etree
-from dataclasses import dataclass
+
class ExtractionStrategy(ABC):
"""
Abstract base class for all extraction strategies.
"""
-
+
def __init__(self, input_format: str = "markdown", **kwargs):
"""
Initialize the extraction strategy.
@@ -45,7 +71,7 @@ class ExtractionStrategy(ABC):
:return: A list of extracted blocks or chunks.
"""
pass
-
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
"""
Process sections of text in parallel by default.
@@ -56,313 +82,49 @@ class ExtractionStrategy(ABC):
"""
extracted_content = []
with ThreadPoolExecutor() as executor:
- futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections]
+ futures = [
+ executor.submit(self.extract, url, section, **kwargs)
+ for section in sections
+ ]
for future in as_completed(futures):
extracted_content.extend(future.result())
- return extracted_content
-
+ return extracted_content
+
+
class NoExtractionStrategy(ExtractionStrategy):
"""
A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
"""
+
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
"""
Extract meaningful blocks or chunks from the given HTML.
"""
return [{"index": 0, "content": html}]
-
+
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
- return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
-
-#######################################################
-# Strategies using LLM-based extraction for text data #
-#######################################################
-class LLMExtractionStrategy(ExtractionStrategy):
- """
- A strategy that uses an LLM to extract meaningful content from the HTML.
-
- Attributes:
- provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3".
- api_token: The API token for the provider.
- instruction: The instruction to use for the LLM model.
- schema: Pydantic model schema for structured data.
- extraction_type: "block" or "schema".
- chunk_token_threshold: Maximum tokens per chunk.
- overlap_rate: Overlap between chunks.
- word_token_rate: Word to token conversion rate.
- apply_chunking: Whether to apply chunking.
- base_url: The base URL for the API request.
- api_base: The base URL for the API request.
- extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
- verbose: Whether to print verbose output.
- usages: List of individual token usages.
- total_usage: Accumulated token usage.
- """
-
- def __init__(self,
- provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
- instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
- """
- Initialize the strategy with clustering parameters.
-
- Args:
- provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3".
- api_token: The API token for the provider.
- instruction: The instruction to use for the LLM model.
- schema: Pydantic model schema for structured data.
- extraction_type: "block" or "schema".
- chunk_token_threshold: Maximum tokens per chunk.
- overlap_rate: Overlap between chunks.
- word_token_rate: Word to token conversion rate.
- apply_chunking: Whether to apply chunking.
- base_url: The base URL for the API request.
- api_base: The base URL for the API request.
- extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
- verbose: Whether to print verbose output.
- usages: List of individual token usages.
- total_usage: Accumulated token usage.
-
- """
- super().__init__(**kwargs)
- self.provider = provider
- self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
- self.instruction = instruction
- self.extract_type = extraction_type
- self.schema = schema
- if schema:
- self.extract_type = "schema"
-
- self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
- self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
- self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
- self.apply_chunking = kwargs.get("apply_chunking", True)
- self.base_url = kwargs.get("base_url", None)
- self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
- self.extra_args = kwargs.get("extra_args", {})
- if not self.apply_chunking:
- self.chunk_token_threshold = 1e9
-
- self.verbose = kwargs.get("verbose", False)
- self.usages = [] # Store individual usages
- self.total_usage = TokenUsage() # Accumulated usage
-
- if not self.api_token:
- raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
-
-
- def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
- """
- Extract meaningful blocks or chunks from the given HTML using an LLM.
-
- How it works:
- 1. Construct a prompt with variables.
- 2. Make a request to the LLM using the prompt.
- 3. Parse the response and extract blocks or chunks.
-
- Args:
- url: The URL of the webpage.
- ix: Index of the block.
- html: The HTML content of the webpage.
-
- Returns:
- A list of extracted blocks or chunks.
- """
- if self.verbose:
- # print("[LOG] Extracting blocks from URL:", url)
- print(f"[LOG] Call LLM for {url} - block index: {ix}")
-
- variable_values = {
- "URL": url,
- "HTML": escape_json_string(sanitize_html(html)),
- }
-
- prompt_with_variables = PROMPT_EXTRACT_BLOCKS
- if self.instruction:
- variable_values["REQUEST"] = self.instruction
- prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
-
- if self.extract_type == "schema" and self.schema:
- variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
- prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
-
- for variable in variable_values:
- prompt_with_variables = prompt_with_variables.replace(
- "{" + variable + "}", variable_values[variable]
- )
-
- response = perform_completion_with_backoff(
- self.provider,
- prompt_with_variables,
- self.api_token,
- base_url=self.api_base or self.base_url,
- extra_args = self.extra_args
- ) # , json_response=self.extract_type == "schema")
- # Track usage
- usage = TokenUsage(
- completion_tokens=response.usage.completion_tokens,
- prompt_tokens=response.usage.prompt_tokens,
- total_tokens=response.usage.total_tokens,
- completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {},
- prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {}
- )
- self.usages.append(usage)
-
- # Update totals
- self.total_usage.completion_tokens += usage.completion_tokens
- self.total_usage.prompt_tokens += usage.prompt_tokens
- self.total_usage.total_tokens += usage.total_tokens
-
- try:
- blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
- blocks = json.loads(blocks)
- for block in blocks:
- block['error'] = False
- except Exception as e:
- parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
- blocks = parsed
- if unparsed:
- blocks.append({
- "index": 0,
- "error": True,
- "tags": ["error"],
- "content": unparsed
- })
-
- if self.verbose:
- print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix)
- return blocks
-
- def _merge(self, documents, chunk_token_threshold, overlap):
- """
- Merge documents into sections based on chunk_token_threshold and overlap.
- """
- chunks = []
- sections = []
- total_tokens = 0
-
- # Calculate the total tokens across all documents
- for document in documents:
- total_tokens += len(document.split(' ')) * self.word_token_rate
-
- # Calculate the number of sections needed
- num_sections = math.floor(total_tokens / chunk_token_threshold)
- if num_sections < 1:
- num_sections = 1 # Ensure there is at least one section
- adjusted_chunk_threshold = total_tokens / num_sections
-
- total_token_so_far = 0
- current_chunk = []
-
- for document in documents:
- tokens = document.split(' ')
- token_count = len(tokens) * self.word_token_rate
-
- if total_token_so_far + token_count <= adjusted_chunk_threshold:
- current_chunk.extend(tokens)
- total_token_so_far += token_count
- else:
- # Ensure to handle the last section properly
- if len(sections) == num_sections - 1:
- current_chunk.extend(tokens)
- continue
-
- # Add overlap if specified
- if overlap > 0 and current_chunk:
- overlap_tokens = current_chunk[-overlap:]
- current_chunk.extend(overlap_tokens)
-
- sections.append(' '.join(current_chunk))
- current_chunk = tokens
- total_token_so_far = token_count
-
- # Add the last chunk
- if current_chunk:
- sections.append(' '.join(current_chunk))
-
- return sections
+ return [
+ {"index": i, "tags": [], "content": section}
+ for i, section in enumerate(sections)
+ ]
- def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
- """
- Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
-
- Args:
- url: The URL of the webpage.
- sections: List of sections (strings) to process.
-
- Returns:
- A list of extracted blocks or chunks.
- """
-
- merged_sections = self._merge(
- sections, self.chunk_token_threshold,
- overlap= int(self.chunk_token_threshold * self.overlap_rate)
- )
- extracted_content = []
- if self.provider.startswith("groq/"):
- # Sequential processing with a delay
- for ix, section in enumerate(merged_sections):
- extract_func = partial(self.extract, url)
- extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
- time.sleep(0.5) # 500 ms delay between each processing
- else:
- # Parallel processing using ThreadPoolExecutor
- # extract_func = partial(self.extract, url)
- # for ix, section in enumerate(merged_sections):
- # extracted_content.append(extract_func(ix, section))
-
- with ThreadPoolExecutor(max_workers=4) as executor:
- extract_func = partial(self.extract, url)
- futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
-
- for future in as_completed(futures):
- try:
- extracted_content.extend(future.result())
- except Exception as e:
- if self.verbose:
- print(f"Error in thread execution: {e}")
- # Add error information to extracted_content
- extracted_content.append({
- "index": 0,
- "error": True,
- "tags": ["error"],
- "content": str(e)
- })
-
-
- return extracted_content
-
-
- def show_usage(self) -> None:
- """Print a detailed token usage report showing total and per-request usage."""
- print("\n=== Token Usage Summary ===")
- print(f"{'Type':<15} {'Count':>12}")
- print("-" * 30)
- print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
- print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
- print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
-
- print("\n=== Usage History ===")
- print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
- print("-" * 48)
- for i, usage in enumerate(self.usages, 1):
- print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
-
#######################################################
# Strategies using clustering for text data extraction #
#######################################################
+
class CosineStrategy(ExtractionStrategy):
"""
Extract meaningful blocks or chunks from the given HTML using cosine similarity.
-
+
How it works:
1. Pre-filter documents using embeddings and semantic_filter.
2. Perform clustering using cosine similarity.
3. Organize texts by their cluster labels, retaining order.
4. Filter clusters by word count.
5. Extract meaningful blocks or chunks from the filtered clusters.
-
+
Attributes:
semantic_filter (str): A keyword filter for document filtering.
word_count_threshold (int): Minimum number of words per cluster.
@@ -371,8 +133,19 @@ class CosineStrategy(ExtractionStrategy):
top_k (int): Number of top categories to extract.
model_name (str): The name of the sentence-transformers model.
sim_threshold (float): The similarity threshold for clustering.
- """
- def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
+ """
+
+ def __init__(
+ self,
+ semantic_filter=None,
+ word_count_threshold=10,
+ max_dist=0.2,
+ linkage_method="ward",
+ top_k=3,
+ model_name="sentence-transformers/all-MiniLM-L6-v2",
+ sim_threshold=0.3,
+ **kwargs,
+ ):
"""
Initialize the strategy with clustering parameters.
@@ -384,9 +157,9 @@ class CosineStrategy(ExtractionStrategy):
top_k (int): Number of top categories to extract.
"""
super().__init__(**kwargs)
-
+
import numpy as np
-
+
self.semantic_filter = semantic_filter
self.word_count_threshold = word_count_threshold
self.max_dist = max_dist
@@ -395,14 +168,14 @@ class CosineStrategy(ExtractionStrategy):
self.sim_threshold = sim_threshold
self.timer = time.time()
self.verbose = kwargs.get("verbose", False)
-
+
self.buffer_embeddings = np.array([])
self.get_embedding_method = "direct"
-
+
self.device = get_device()
# import torch
# self.device = torch.device('cpu')
-
+
self.default_batch_size = calculate_batch_size(self.device)
if self.verbose:
@@ -416,10 +189,10 @@ class CosineStrategy(ExtractionStrategy):
self.tokenizer, self.model = load_HF_embedding_model(model_name)
self.model.to(self.device)
- self.model.eval()
-
+ self.model.eval()
+
self.get_embedding_method = "batch"
-
+
self.buffer_embeddings = np.array([])
# if model_name == "bert-base-uncased":
@@ -434,18 +207,23 @@ class CosineStrategy(ExtractionStrategy):
# self.model = load_onnx_all_MiniLM_l6_v2()
# self.tokenizer = self.model.tokenizer
# self.get_embedding_method = "direct"
-
-
+
if self.verbose:
print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")
-
+
self.nlp, _ = load_text_multilabel_classifier()
# self.default_batch_size = 16 if self.device.type == 'cpu' else 64
-
- if self.verbose:
- print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - self.timer) + " seconds")
- def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
+ if self.verbose:
+ print(
+ f"[LOG] Model loaded {model_name}, models/reuters, took "
+ + str(time.time() - self.timer)
+ + " seconds"
+ )
+
+ def filter_documents_embeddings(
+ self, documents: List[str], semantic_filter: str, at_least_k: int = 20
+ ) -> List[str]:
"""
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
@@ -457,39 +235,51 @@ class CosineStrategy(ExtractionStrategy):
Returns:
List[str]: A list of filtered and sorted document texts.
"""
-
+
if not semantic_filter:
return documents
-
+
if len(documents) < at_least_k:
at_least_k = len(documents) // 2
-
+
from sklearn.metrics.pairwise import cosine_similarity
-
+
# Compute embedding for the keyword filter
query_embedding = self.get_embeddings([semantic_filter])[0]
-
+
# Compute embeddings for the documents
document_embeddings = self.get_embeddings(documents)
-
+
# Calculate cosine similarity between the query embedding and document embeddings
- similarities = cosine_similarity([query_embedding], document_embeddings).flatten()
-
+ similarities = cosine_similarity(
+ [query_embedding], document_embeddings
+ ).flatten()
+
# Filter documents based on the similarity threshold
- filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold]
-
+ filtered_docs = [
+ (doc, sim)
+ for doc, sim in zip(documents, similarities)
+ if sim >= self.sim_threshold
+ ]
+
# If the number of filtered documents is less than at_least_k, sort remaining documents by similarity
if len(filtered_docs) < at_least_k:
- remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold]
+ remaining_docs = [
+ (doc, sim)
+ for doc, sim in zip(documents, similarities)
+ if sim < self.sim_threshold
+ ]
remaining_docs.sort(key=lambda x: x[1], reverse=True)
- filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)])
-
+ filtered_docs.extend(remaining_docs[: at_least_k - len(filtered_docs)])
+
# Extract the document texts from the tuples
filtered_docs = [doc for doc, _ in filtered_docs]
-
+
return filtered_docs[:at_least_k]
-
- def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False):
+
+ def get_embeddings(
+ self, sentences: List[str], batch_size=None, bypass_buffer=False
+ ):
"""
Get BERT embeddings for a list of sentences.
@@ -501,43 +291,48 @@ class CosineStrategy(ExtractionStrategy):
"""
# if self.buffer_embeddings.any() and not bypass_buffer:
# return self.buffer_embeddings
-
- if self.device.type in [ "cpu", "gpu", "cuda", "mps"]:
- import torch
+
+ if self.device.type in ["cpu", "gpu", "cuda", "mps"]:
+ import torch
+
# Tokenize sentences and convert to tensor
if batch_size is None:
batch_size = self.default_batch_size
-
+
all_embeddings = []
for i in range(0, len(sentences), batch_size):
- batch_sentences = sentences[i:i + batch_size]
- encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
- encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
-
+ batch_sentences = sentences[i : i + batch_size]
+ encoded_input = self.tokenizer(
+ batch_sentences, padding=True, truncation=True, return_tensors="pt"
+ )
+ encoded_input = {
+ key: tensor.to(self.device) for key, tensor in encoded_input.items()
+ }
+
# Ensure no gradients are calculated
with torch.no_grad():
model_output = self.model(**encoded_input)
-
+
# Get embeddings from the last hidden state (mean pooling)
embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
all_embeddings.append(embeddings)
-
+
self.buffer_embeddings = np.vstack(all_embeddings)
- elif self.device.type == "cpu":
+ elif self.device.type == "cpu":
# self.buffer_embeddings = self.model(sentences)
if batch_size is None:
batch_size = self.default_batch_size
-
+
all_embeddings = []
for i in range(0, len(sentences), batch_size):
- batch_sentences = sentences[i:i + batch_size]
+ batch_sentences = sentences[i : i + batch_size]
embeddings = self.model(batch_sentences)
all_embeddings.append(embeddings)
-
+
self.buffer_embeddings = np.vstack(all_embeddings)
return self.buffer_embeddings
- def hierarchical_clustering(self, sentences: List[str], embeddings = None):
+ def hierarchical_clustering(self, sentences: List[str], embeddings=None):
"""
Perform hierarchical clustering on sentences and return cluster labels.
@@ -550,18 +345,21 @@ class CosineStrategy(ExtractionStrategy):
# Get embeddings
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
+
self.timer = time.time()
embeddings = self.get_embeddings(sentences, bypass_buffer=True)
# print(f"[LOG] π Embeddings computed in {time.time() - self.timer:.2f} seconds")
# Compute pairwise cosine distances
- distance_matrix = pdist(embeddings, 'cosine')
+ distance_matrix = pdist(embeddings, "cosine")
# Perform agglomerative clustering respecting order
linked = linkage(distance_matrix, method=self.linkage_method)
# Form flat clusters
- labels = fcluster(linked, self.max_dist, criterion='distance')
+ labels = fcluster(linked, self.max_dist, criterion="distance")
return labels
- def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
+ def filter_clusters_by_word_count(
+ self, clusters: Dict[int, List[str]]
+ ) -> Dict[int, List[str]]:
"""
Filter clusters to remove those with a word count below the threshold.
@@ -577,7 +375,7 @@ class CosineStrategy(ExtractionStrategy):
full_text = " ".join(texts)
# Count words
word_count = len(full_text.split())
-
+
# Keep clusters with word count above the threshold
if word_count >= self.word_count_threshold:
filtered_clusters[cluster_id] = texts
@@ -598,9 +396,11 @@ class CosineStrategy(ExtractionStrategy):
# Assume `html` is a list of text chunks for this strategy
t = time.time()
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
-
+
# Pre-filter documents using embeddings and semantic_filter
- text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
+ text_chunks = self.filter_documents_embeddings(
+ text_chunks, self.semantic_filter
+ )
if not text_chunks:
return []
@@ -619,16 +419,19 @@ class CosineStrategy(ExtractionStrategy):
filtered_clusters = self.filter_clusters_by_word_count(clusters)
# Convert filtered clusters to a sorted list of dictionaries
- cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
-
+ cluster_list = [
+ {"index": int(idx), "tags": [], "content": " ".join(filtered_clusters[idx])}
+ for idx in sorted(filtered_clusters)
+ ]
+
if self.verbose:
print(f"[LOG] π Assign tags using {self.device}")
-
+
if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
- labels = self.nlp([cluster['content'] for cluster in cluster_list])
-
+ labels = self.nlp([cluster["content"] for cluster in cluster_list])
+
for cluster, label in zip(cluster_list, labels):
- cluster['tags'] = label
+ cluster["tags"] = label
# elif self.device.type == "cpu":
# # Process the text with the loaded model
# texts = [cluster['content'] for cluster in cluster_list]
@@ -639,16 +442,16 @@ class CosineStrategy(ExtractionStrategy):
# tok_k = self.top_k
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
# cluster['tags'] = [cat for cat, _ in top_categories]
-
- # for cluster in cluster_list:
- # doc = self.nlp(cluster['content'])
- # tok_k = self.top_k
- # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
- # cluster['tags'] = [cat for cat, _ in top_categories]
-
+
+ # for cluster in cluster_list:
+ # doc = self.nlp(cluster['content'])
+ # tok_k = self.top_k
+ # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+ # cluster['tags'] = [cat for cat, _ in top_categories]
+
if self.verbose:
print(f"[LOG] π Categorization done in {time.time() - t:.2f} seconds")
-
+
return cluster_list
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
@@ -662,12 +465,321 @@ class CosineStrategy(ExtractionStrategy):
Returns:
"""
# This strategy processes all sections together
-
+
return self.extract(url, self.DEL.join(sections), **kwargs)
-
+
+
+#######################################################
+# Strategies using LLM-based extraction for text data #
+#######################################################
+class LLMExtractionStrategy(ExtractionStrategy):
+    """
+    A strategy that uses an LLM to extract meaningful content from the HTML.
+
+    Attributes:
+        provider: The provider to use for extraction. It follows the format
+            "<provider>/<model>", e.g., "ollama/llama3.3".
+        api_token: The API token for the provider.
+        instruction: The instruction to use for the LLM model.
+        schema: Pydantic model schema for structured data.
+        extraction_type: "block" or "schema".
+        chunk_token_threshold: Maximum tokens per chunk.
+        overlap_rate: Overlap between chunks.
+        word_token_rate: Word to token conversion rate.
+        apply_chunking: Whether to apply chunking.
+        base_url: The base URL for the API request.
+        api_base: The base URL for the API request.
+        extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
+        verbose: Whether to print verbose output.
+        usages: List of individual token usages.
+        total_usage: Accumulated token usage.
+    """
+
+ def __init__(
+ self,
+ provider: str = DEFAULT_PROVIDER,
+ api_token: Optional[str] = None,
+ instruction: str = None,
+ schema: Dict = None,
+ extraction_type="block",
+ **kwargs,
+ ):
+        """
+        Initialize the strategy with clustering parameters.
+
+        Args:
+            provider: The provider to use for extraction. It follows the format
+                "<provider>/<model>", e.g., "ollama/llama3.3".
+            api_token: The API token for the provider.
+            instruction: The instruction to use for the LLM model.
+            schema: Pydantic model schema for structured data.
+            extraction_type: "block" or "schema".
+            chunk_token_threshold: Maximum tokens per chunk.
+            overlap_rate: Overlap between chunks.
+            word_token_rate: Word to token conversion rate.
+            apply_chunking: Whether to apply chunking.
+            base_url: The base URL for the API request.
+            api_base: The base URL for the API request.
+            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
+            verbose: Whether to print verbose output.
+            usages: List of individual token usages.
+            total_usage: Accumulated token usage.
+        """
+ super().__init__(**kwargs)
+ self.provider = provider
+        # Fallback chain: explicit token -> provider default -> env var -> sentinel.
+        # NOTE: the previous `.get(provider, "no-token")` default was always truthy,
+        # which made the OPENAI_API_KEY fallback unreachable dead code.
+        self.api_token = (
+            api_token
+            or PROVIDER_MODELS.get(provider)
+            or os.getenv("OPENAI_API_KEY", "no-token")
+        )
+ self.instruction = instruction
+ self.extract_type = extraction_type
+ self.schema = schema
+ if schema:
+ self.extract_type = "schema"
+
+ self.chunk_token_threshold = kwargs.get(
+ "chunk_token_threshold", CHUNK_TOKEN_THRESHOLD
+ )
+ self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
+ self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
+ self.apply_chunking = kwargs.get("apply_chunking", True)
+ self.base_url = kwargs.get("base_url", None)
+ self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
+ self.extra_args = kwargs.get("extra_args", {})
+ if not self.apply_chunking:
+ self.chunk_token_threshold = 1e9
+
+ self.verbose = kwargs.get("verbose", False)
+ self.usages = [] # Store individual usages
+ self.total_usage = TokenUsage() # Accumulated usage
+
+ if not self.api_token:
+ raise ValueError(
+ "API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable."
+ )
+
+ def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
+ """
+ Extract meaningful blocks or chunks from the given HTML using an LLM.
+
+ How it works:
+ 1. Construct a prompt with variables.
+ 2. Make a request to the LLM using the prompt.
+ 3. Parse the response and extract blocks or chunks.
+
+ Args:
+ url: The URL of the webpage.
+ ix: Index of the block.
+ html: The HTML content of the webpage.
+
+ Returns:
+ A list of extracted blocks or chunks.
+ """
+ if self.verbose:
+ # print("[LOG] Extracting blocks from URL:", url)
+ print(f"[LOG] Call LLM for {url} - block index: {ix}")
+
+ variable_values = {
+ "URL": url,
+ "HTML": escape_json_string(sanitize_html(html)),
+ }
+
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS
+ if self.instruction:
+ variable_values["REQUEST"] = self.instruction
+ prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
+
+ if self.extract_type == "schema" and self.schema:
+ variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
+ prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
+
+ for variable in variable_values:
+ prompt_with_variables = prompt_with_variables.replace(
+ "{" + variable + "}", variable_values[variable]
+ )
+
+ response = perform_completion_with_backoff(
+ self.provider,
+ prompt_with_variables,
+ self.api_token,
+ base_url=self.api_base or self.base_url,
+ extra_args=self.extra_args,
+ ) # , json_response=self.extract_type == "schema")
+ # Track usage
+ usage = TokenUsage(
+ completion_tokens=response.usage.completion_tokens,
+ prompt_tokens=response.usage.prompt_tokens,
+ total_tokens=response.usage.total_tokens,
+ completion_tokens_details=response.usage.completion_tokens_details.__dict__
+ if response.usage.completion_tokens_details
+ else {},
+ prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+ if response.usage.prompt_tokens_details
+ else {},
+ )
+ self.usages.append(usage)
+
+ # Update totals
+ self.total_usage.completion_tokens += usage.completion_tokens
+ self.total_usage.prompt_tokens += usage.prompt_tokens
+ self.total_usage.total_tokens += usage.total_tokens
+
+ try:
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[
+ "blocks"
+ ]
+ blocks = json.loads(blocks)
+ for block in blocks:
+ block["error"] = False
+ except Exception:
+ parsed, unparsed = split_and_parse_json_objects(
+ response.choices[0].message.content
+ )
+ blocks = parsed
+ if unparsed:
+ blocks.append(
+ {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+ )
+
+ if self.verbose:
+ print(
+ "[LOG] Extracted",
+ len(blocks),
+ "blocks from URL:",
+ url,
+ "block index:",
+ ix,
+ )
+ return blocks
+
+ def _merge(self, documents, chunk_token_threshold, overlap):
+ """
+ Merge documents into sections based on chunk_token_threshold and overlap.
+ """
+ # chunks = []
+ sections = []
+ total_tokens = 0
+
+ # Calculate the total tokens across all documents
+ for document in documents:
+ total_tokens += len(document.split(" ")) * self.word_token_rate
+
+ # Calculate the number of sections needed
+ num_sections = math.floor(total_tokens / chunk_token_threshold)
+ if num_sections < 1:
+ num_sections = 1 # Ensure there is at least one section
+ adjusted_chunk_threshold = total_tokens / num_sections
+
+ total_token_so_far = 0
+ current_chunk = []
+
+ for document in documents:
+ tokens = document.split(" ")
+ token_count = len(tokens) * self.word_token_rate
+
+ if total_token_so_far + token_count <= adjusted_chunk_threshold:
+ current_chunk.extend(tokens)
+ total_token_so_far += token_count
+ else:
+ # Ensure to handle the last section properly
+ if len(sections) == num_sections - 1:
+ current_chunk.extend(tokens)
+ continue
+
+                # Flush the current section, then seed the NEXT chunk with the
+                # tail of the previous one so consecutive sections actually
+                # overlap. (Previously the tail was appended to the section
+                # being flushed, duplicating text without inter-chunk overlap.)
+                sections.append(" ".join(current_chunk))
+                overlap_tokens = current_chunk[-overlap:] if overlap > 0 else []
+                current_chunk = overlap_tokens + tokens
+                total_token_so_far = (
+                    len(overlap_tokens) * self.word_token_rate + token_count
+                )
+
+ # Add the last chunk
+ if current_chunk:
+ sections.append(" ".join(current_chunk))
+
+ return sections
+
+ def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
+ """
+ Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
+
+ Args:
+ url: The URL of the webpage.
+ sections: List of sections (strings) to process.
+
+ Returns:
+ A list of extracted blocks or chunks.
+ """
+
+ merged_sections = self._merge(
+ sections,
+ self.chunk_token_threshold,
+ overlap=int(self.chunk_token_threshold * self.overlap_rate),
+ )
+ extracted_content = []
+ if self.provider.startswith("groq/"):
+ # Sequential processing with a delay
+ for ix, section in enumerate(merged_sections):
+ extract_func = partial(self.extract, url)
+ extracted_content.extend(
+ extract_func(ix, sanitize_input_encode(section))
+ )
+ time.sleep(0.5) # 500 ms delay between each processing
+ else:
+ # Parallel processing using ThreadPoolExecutor
+ # extract_func = partial(self.extract, url)
+ # for ix, section in enumerate(merged_sections):
+ # extracted_content.append(extract_func(ix, section))
+
+ with ThreadPoolExecutor(max_workers=4) as executor:
+ extract_func = partial(self.extract, url)
+ futures = [
+ executor.submit(extract_func, ix, sanitize_input_encode(section))
+ for ix, section in enumerate(merged_sections)
+ ]
+
+ for future in as_completed(futures):
+ try:
+ extracted_content.extend(future.result())
+ except Exception as e:
+ if self.verbose:
+ print(f"Error in thread execution: {e}")
+ # Add error information to extracted_content
+ extracted_content.append(
+ {
+ "index": 0,
+ "error": True,
+ "tags": ["error"],
+ "content": str(e),
+ }
+ )
+
+ return extracted_content
+
+ def show_usage(self) -> None:
+ """Print a detailed token usage report showing total and per-request usage."""
+ print("\n=== Token Usage Summary ===")
+ print(f"{'Type':<15} {'Count':>12}")
+ print("-" * 30)
+ print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
+ print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
+ print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")
+
+ print("\n=== Usage History ===")
+ print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
+ print("-" * 48)
+ for i, usage in enumerate(self.usages, 1):
+ print(
+ f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
+ )
+
+
#######################################################
# New extraction strategies for JSON-based extraction #
-#######################################################
+#######################################################
+
class JsonElementExtractionStrategy(ExtractionStrategy):
"""
@@ -701,8 +813,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
"""
-
- DEL = '\n'
+ DEL = "\n"
def __init__(self, schema: Dict[str, Any], **kwargs):
"""
@@ -713,9 +824,11 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
"""
super().__init__(**kwargs)
self.schema = schema
- self.verbose = kwargs.get('verbose', False)
+ self.verbose = kwargs.get("verbose", False)
- def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
+ def extract(
+ self, url: str, html_content: str, *q, **kwargs
+ ) -> List[Dict[str, Any]]:
"""
Extract structured data from HTML content.
@@ -733,27 +846,29 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Returns:
List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
"""
-
+
parsed_html = self._parse_html(html_content)
- base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
-
+ base_elements = self._get_base_elements(
+ parsed_html, self.schema["baseSelector"]
+ )
+
results = []
for element in base_elements:
# Extract base element attributes
item = {}
- if 'baseFields' in self.schema:
- for field in self.schema['baseFields']:
+ if "baseFields" in self.schema:
+ for field in self.schema["baseFields"]:
value = self._extract_single_field(element, field)
if value is not None:
- item[field['name']] = value
-
+ item[field["name"]] = value
+
# Extract child fields
- field_data = self._extract_item(element, self.schema['fields'])
+ field_data = self._extract_item(element, self.schema["fields"])
item.update(field_data)
-
+
if item:
results.append(item)
-
+
return results
@abstractmethod
@@ -773,24 +888,28 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
def _extract_field(self, element, field):
try:
- if field['type'] == 'nested':
- nested_elements = self._get_elements(element, field['selector'])
+ if field["type"] == "nested":
+ nested_elements = self._get_elements(element, field["selector"])
nested_element = nested_elements[0] if nested_elements else None
- return self._extract_item(nested_element, field['fields']) if nested_element else {}
-
- if field['type'] == 'list':
- elements = self._get_elements(element, field['selector'])
- return [self._extract_list_item(el, field['fields']) for el in elements]
-
- if field['type'] == 'nested_list':
- elements = self._get_elements(element, field['selector'])
- return [self._extract_item(el, field['fields']) for el in elements]
-
+ return (
+ self._extract_item(nested_element, field["fields"])
+ if nested_element
+ else {}
+ )
+
+ if field["type"] == "list":
+ elements = self._get_elements(element, field["selector"])
+ return [self._extract_list_item(el, field["fields"]) for el in elements]
+
+ if field["type"] == "nested_list":
+ elements = self._get_elements(element, field["selector"])
+ return [self._extract_item(el, field["fields"]) for el in elements]
+
return self._extract_single_field(element, field)
except Exception as e:
if self.verbose:
print(f"Error extracting field {field['name']}: {str(e)}")
- return field.get('default')
+ return field.get("default")
def _extract_single_field(self, element, field):
"""
@@ -808,38 +927,38 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Returns:
Any: The extracted field value.
"""
-
- if 'selector' in field:
- selected = self._get_elements(element, field['selector'])
+
+ if "selector" in field:
+ selected = self._get_elements(element, field["selector"])
if not selected:
- return field.get('default')
+ return field.get("default")
selected = selected[0]
else:
selected = element
value = None
- if field['type'] == 'text':
+ if field["type"] == "text":
value = self._get_element_text(selected)
- elif field['type'] == 'attribute':
- value = self._get_element_attribute(selected, field['attribute'])
- elif field['type'] == 'html':
+ elif field["type"] == "attribute":
+ value = self._get_element_attribute(selected, field["attribute"])
+ elif field["type"] == "html":
value = self._get_element_html(selected)
- elif field['type'] == 'regex':
+ elif field["type"] == "regex":
text = self._get_element_text(selected)
- match = re.search(field['pattern'], text)
+ match = re.search(field["pattern"], text)
value = match.group(1) if match else None
- if 'transform' in field:
- value = self._apply_transform(value, field['transform'])
+ if "transform" in field:
+ value = self._apply_transform(value, field["transform"])
- return value if value is not None else field.get('default')
+ return value if value is not None else field.get("default")
def _extract_list_item(self, element, fields):
item = {}
for field in fields:
value = self._extract_single_field(element, field)
if value is not None:
- item[field['name']] = value
+ item[field["name"]] = value
return item
def _extract_item(self, element, fields):
@@ -858,15 +977,15 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Returns:
Dict[str, Any]: A dictionary representing the extracted item.
"""
-
+
item = {}
for field in fields:
- if field['type'] == 'computed':
+ if field["type"] == "computed":
value = self._compute_field(item, field)
else:
value = self._extract_field(element, field)
if value is not None:
- item[field['name']] = value
+ item[field["name"]] = value
return item
def _apply_transform(self, value, transform):
@@ -885,25 +1004,25 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Returns:
str: The transformed value.
"""
-
- if transform == 'lowercase':
+
+ if transform == "lowercase":
return value.lower()
- elif transform == 'uppercase':
+ elif transform == "uppercase":
return value.upper()
- elif transform == 'strip':
+ elif transform == "strip":
return value.strip()
return value
def _compute_field(self, item, field):
try:
- if 'expression' in field:
- return eval(field['expression'], {}, item)
- elif 'function' in field:
- return field['function'](item)
+            if "expression" in field:
+                # SECURITY: eval() executes arbitrary code from the schema.
+                # Only use schemas from trusted sources; for constant
+                # expressions prefer ast.literal_eval.
+                return eval(field["expression"], {}, item)
+ elif "function" in field:
+ return field["function"](item)
except Exception as e:
if self.verbose:
print(f"Error computing field {field['name']}: {str(e)}")
- return field.get('default')
+ return field.get("default")
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
"""
@@ -922,7 +1041,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
Returns:
List[Dict[str, Any]]: A list of extracted items.
"""
-
+
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)
@@ -941,6 +1060,73 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
"""Get attribute value from element"""
pass
+    @staticmethod
+    def generate_schema(
+        html: str,
+        schema_type: str = "CSS",  # or XPATH
+        query: str = None,
+        provider: str = "gpt-4o",
+        api_token: str = None,
+        **kwargs
+    ) -> dict:
+        """
+        Generate extraction schema from HTML content and optional query.
+
+        Args:
+            html (str): The HTML content to analyze
+            schema_type (str): "CSS" or "XPATH" — selects the prompt template
+            query (str, optional): Natural language description of what data to extract
+            provider (str): LLM provider to use
+            api_token (str, optional): API token for LLM provider; falls back to
+                the OPENAI_API_KEY environment variable at call time
+            **kwargs: Additional args passed to perform_completion_with_backoff
+
+        Returns:
+            dict: Generated schema following the JsonElementExtractionStrategy format
+        """
+        from .prompts import JSON_SCHEMA_BUILDER, JSON_SCHEMA_BUILDER_XPATH
+        from .utils import perform_completion_with_backoff
+
+        # Resolve the token at call time: an `os.getenv(...)` default in the
+        # signature would be frozen when the module is imported.
+        api_token = api_token or os.getenv("OPENAI_API_KEY")
+
+        # Pick the prompt template matching the selector flavor. (Both names
+        # must be imported above; XPATH previously raised NameError.)
+        prompt_template = (
+            JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
+        )
+
+ # Build the prompt
+ system_message = {
+ "role": "system",
+ "content": "You are a specialized HTML schema generator. Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else."
+ }
+
+ user_message = {
+ "role": "user",
+ "content": f"""
+ Instructions:
+ {prompt_template}
+
+ HTML to analyze:
+ ```html
+ {html}
+ ```
+
+ {"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
+ """
+ }
+
+        try:
+            # Call LLM with backoff handling
+            response = perform_completion_with_backoff(
+                provider=provider,
+                prompt_with_variables="\n\n".join(
+                    [system_message["content"], user_message["content"]]
+                ),
+                json_response=True,
+                api_token=api_token,
+                **kwargs
+            )
+
+            # Extract and return schema
+            return json.loads(response.choices[0].message.content)
+
+        except Exception as e:
+            # Chain the original exception so callers can inspect the root cause.
+            raise Exception(f"Failed to generate schema: {str(e)}") from e
+
+
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
"""
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
@@ -962,13 +1148,13 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
_get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
_get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
"""
-
+
def __init__(self, schema: Dict[str, Any], **kwargs):
- kwargs['input_format'] = 'html' # Force HTML input
+ kwargs["input_format"] = "html" # Force HTML input
super().__init__(schema, **kwargs)
def _parse_html(self, html_content: str):
- return BeautifulSoup(html_content, 'html.parser')
+ return BeautifulSoup(html_content, "html.parser")
def _get_base_elements(self, parsed_html, selector: str):
return parsed_html.select(selector)
@@ -987,6 +1173,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
+
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
"""
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
@@ -1009,9 +1196,9 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
_get_element_html(element): Extracts the raw HTML content of an lxml element.
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
"""
-
+
def __init__(self, schema: Dict[str, Any], **kwargs):
- kwargs['input_format'] = 'html' # Force HTML input
+ kwargs["input_format"] = "html" # Force HTML input
super().__init__(schema, **kwargs)
def _parse_html(self, html_content: str):
@@ -1022,31 +1209,32 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _css_to_xpath(self, css_selector: str) -> str:
"""Convert CSS selector to XPath if needed"""
- if '/' in css_selector: # Already an XPath
+ if "/" in css_selector: # Already an XPath
return css_selector
return self._basic_css_to_xpath(css_selector)
def _basic_css_to_xpath(self, css_selector: str) -> str:
"""Basic CSS to XPath conversion for common cases"""
- if ' > ' in css_selector:
- parts = css_selector.split(' > ')
- return '//' + '/'.join(parts)
- if ' ' in css_selector:
- parts = css_selector.split(' ')
- return '//' + '//'.join(parts)
- return '//' + css_selector
+ if " > " in css_selector:
+ parts = css_selector.split(" > ")
+ return "//" + "/".join(parts)
+ if " " in css_selector:
+ parts = css_selector.split(" ")
+ return "//" + "//".join(parts)
+ return "//" + css_selector
def _get_elements(self, element, selector: str):
xpath = self._css_to_xpath(selector)
- if not xpath.startswith('.'):
- xpath = '.' + xpath
+ if not xpath.startswith("."):
+ xpath = "." + xpath
return element.xpath(xpath)
def _get_element_text(self, element) -> str:
- return ''.join(element.xpath('.//text()')).strip()
+ return "".join(element.xpath(".//text()")).strip()
def _get_element_html(self, element) -> str:
- return etree.tostring(element, encoding='unicode')
+ return etree.tostring(element, encoding="unicode")
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
+
diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index c41258e0..a3349e70 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -54,13 +54,13 @@ class HTML2Text(html.parser.HTMLParser):
self.td_count = 0
self.table_start = False
self.unicode_snob = config.UNICODE_SNOB # covered in cli
-
+
self.escape_snob = config.ESCAPE_SNOB # covered in cli
self.escape_backslash = config.ESCAPE_BACKSLASH # covered in cli
self.escape_dot = config.ESCAPE_DOT # covered in cli
self.escape_plus = config.ESCAPE_PLUS # covered in cli
self.escape_dash = config.ESCAPE_DASH # covered in cli
-
+
self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
self.body_width = bodywidth # covered in cli
self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
@@ -144,8 +144,8 @@ class HTML2Text(html.parser.HTMLParser):
def update_params(self, **kwargs):
for key, value in kwargs.items():
- setattr(self, key, value)
-
+ setattr(self, key, value)
+
def feed(self, data: str) -> None:
data = data.replace("' + 'script>", "")
super().feed(data)
@@ -903,7 +903,13 @@ class HTML2Text(html.parser.HTMLParser):
self.empty_link = False
if not self.code and not self.pre and not entity_char:
- data = escape_md_section(data, snob=self.escape_snob, escape_dot=self.escape_dot, escape_plus=self.escape_plus, escape_dash=self.escape_dash)
+ data = escape_md_section(
+ data,
+ snob=self.escape_snob,
+ escape_dot=self.escape_dot,
+ escape_plus=self.escape_plus,
+ escape_dash=self.escape_dash,
+ )
self.preceding_data = data
self.o(data, puredata=True)
@@ -1006,6 +1012,7 @@ class HTML2Text(html.parser.HTMLParser):
newlines += 1
return result
+
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
if bodywidth is None:
bodywidth = config.BODY_WIDTH
@@ -1013,6 +1020,7 @@ def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) ->
return h.handle(html)
+
class CustomHTML2Text(HTML2Text):
def __init__(self, *args, handle_code_in_pre=False, **kwargs):
super().__init__(*args, **kwargs)
@@ -1022,8 +1030,8 @@ class CustomHTML2Text(HTML2Text):
self.current_preserved_tag = None
self.preserved_content = []
self.preserve_depth = 0
- self.handle_code_in_pre = handle_code_in_pre
-
+ self.handle_code_in_pre = handle_code_in_pre
+
# Configuration options
self.skip_internal_links = False
self.single_line_break = False
@@ -1041,9 +1049,9 @@ class CustomHTML2Text(HTML2Text):
def update_params(self, **kwargs):
"""Update parameters and set preserved tags."""
for key, value in kwargs.items():
- if key == 'preserve_tags':
+ if key == "preserve_tags":
self.preserve_tags = set(value)
- elif key == 'handle_code_in_pre':
+ elif key == "handle_code_in_pre":
self.handle_code_in_pre = value
else:
setattr(self, key, value)
@@ -1056,17 +1064,19 @@ class CustomHTML2Text(HTML2Text):
self.current_preserved_tag = tag
self.preserved_content = []
# Format opening tag with attributes
- attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
- self.preserved_content.append(f'<{tag}{attr_str}>')
+ attr_str = "".join(
+ f' {k}="{v}"' for k, v in attrs.items() if v is not None
+ )
+ self.preserved_content.append(f"<{tag}{attr_str}>")
self.preserve_depth += 1
return
else:
self.preserve_depth -= 1
if self.preserve_depth == 0:
- self.preserved_content.append(f'{tag}>')
+ self.preserved_content.append(f"{tag}>")
# Output the preserved HTML block with proper spacing
- preserved_html = ''.join(self.preserved_content)
- self.o('\n' + preserved_html + '\n')
+ preserved_html = "".join(self.preserved_content)
+ self.o("\n" + preserved_html + "\n")
self.current_preserved_tag = None
return
@@ -1074,29 +1084,31 @@ class CustomHTML2Text(HTML2Text):
if self.preserve_depth > 0:
if start:
# Format nested tags with attributes
- attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
- self.preserved_content.append(f'<{tag}{attr_str}>')
+ attr_str = "".join(
+ f' {k}="{v}"' for k, v in attrs.items() if v is not None
+ )
+ self.preserved_content.append(f"<{tag}{attr_str}>")
else:
- self.preserved_content.append(f'{tag}>')
+ self.preserved_content.append(f"{tag}>")
return
# Handle pre tags
- if tag == 'pre':
+ if tag == "pre":
if start:
- self.o('```\n') # Markdown code block start
+ self.o("```\n") # Markdown code block start
self.inside_pre = True
else:
- self.o('\n```\n') # Markdown code block end
+ self.o("\n```\n") # Markdown code block end
self.inside_pre = False
- elif tag == 'code':
+ elif tag == "code":
if self.inside_pre and not self.handle_code_in_pre:
# Ignore code tags inside pre blocks if handle_code_in_pre is False
return
if start:
- self.o('`') # Markdown inline code start
+ self.o("`") # Markdown inline code start
self.inside_code = True
else:
- self.o('`') # Markdown inline code end
+ self.o("`") # Markdown inline code end
self.inside_code = False
else:
super().handle_tag(tag, attrs, start)
@@ -1113,13 +1125,12 @@ class CustomHTML2Text(HTML2Text):
return
if self.inside_code:
# Inline code: no newlines allowed
- self.o(data.replace('\n', ' '))
+ self.o(data.replace("\n", " "))
return
# Default behavior for other tags
super().handle_data(data, entity_char)
-
# # Handle pre tags
# if tag == 'pre':
# if start:
diff --git a/crawl4ai/html2text/_typing.py b/crawl4ai/html2text/_typing.py
index eed83251..6e17fed2 100644
--- a/crawl4ai/html2text/_typing.py
+++ b/crawl4ai/html2text/_typing.py
@@ -1,2 +1,3 @@
class OutCallback:
- def __call__(self, s: str) -> None: ...
+ def __call__(self, s: str) -> None:
+ ...
diff --git a/crawl4ai/html2text/utils.py b/crawl4ai/html2text/utils.py
index 1909d2cf..21bf98fb 100644
--- a/crawl4ai/html2text/utils.py
+++ b/crawl4ai/html2text/utils.py
@@ -210,7 +210,7 @@ def escape_md_section(
snob: bool = False,
escape_dot: bool = True,
escape_plus: bool = True,
- escape_dash: bool = True
+ escape_dash: bool = True,
) -> str:
"""
Escapes markdown-sensitive characters across whole document sections.
@@ -233,6 +233,7 @@ def escape_md_section(
return text
+
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
"""
Given the lines of a table
diff --git a/crawl4ai/install.py b/crawl4ai/install.py
index 7efb6800..139be591 100644
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -6,25 +6,44 @@ from .async_logger import AsyncLogger, LogLevel
# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
def post_install():
"""Run all post-installation tasks"""
logger.info("Running post-installation setup...", tag="INIT")
install_playwright()
run_migration()
logger.success("Post-installation setup completed!", tag="COMPLETE")
-
+
+
def install_playwright():
logger.info("Installing Playwright browsers...", tag="INIT")
try:
# subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
- subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
- logger.success("Playwright installation completed successfully.", tag="COMPLETE")
- except subprocess.CalledProcessError as e:
+ subprocess.check_call(
+ [
+ sys.executable,
+ "-m",
+ "playwright",
+ "install",
+ "--with-deps",
+ "--force",
+ "chromium",
+ ]
+ )
+ logger.success(
+ "Playwright installation completed successfully.", tag="COMPLETE"
+ )
+ except subprocess.CalledProcessError:
# logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
- logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
- except Exception as e:
+ logger.warning(
+ f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
+ )
+ except Exception:
# logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
- logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
+ logger.warning(
+ f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
+ )
+
def run_migration():
"""Initialize database during installation"""
@@ -33,18 +52,26 @@ def run_migration():
from crawl4ai.async_database import async_db_manager
asyncio.run(async_db_manager.initialize())
- logger.success("Database initialization completed successfully.", tag="COMPLETE")
+ logger.success(
+ "Database initialization completed successfully.", tag="COMPLETE"
+ )
except ImportError:
logger.warning("Database module not found. Will initialize on first use.")
except Exception as e:
logger.warning(f"Database initialization failed: {e}")
logger.warning("Database will be initialized on first use")
+
async def run_doctor():
"""Test if Crawl4AI is working properly"""
logger.info("Running Crawl4AI health check...", tag="INIT")
try:
- from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+ from .async_webcrawler import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ )
browser_config = BrowserConfig(
headless=True,
@@ -52,7 +79,7 @@ async def run_doctor():
ignore_https_errors=True,
light_mode=True,
viewport_width=1280,
- viewport_height=720
+ viewport_height=720,
)
run_config = CrawlerRunConfig(
@@ -62,10 +89,7 @@ async def run_doctor():
async with AsyncWebCrawler(config=browser_config) as crawler:
logger.info("Testing crawling capabilities...", tag="TEST")
- result = await crawler.arun(
- url="https://crawl4ai.com",
- config=run_config
- )
+ result = await crawler.arun(url="https://crawl4ai.com", config=run_config)
if result and result.markdown:
logger.success("β
Crawling test passed!", tag="COMPLETE")
@@ -77,7 +101,9 @@ async def run_doctor():
logger.error(f"β Test failed: {e}", tag="ERROR")
return False
+
def doctor():
"""Entry point for the doctor command"""
import asyncio
+
return asyncio.run(run_doctor())
diff --git a/crawl4ai/js_snippet/__init__.py b/crawl4ai/js_snippet/__init__.py
index 73b0c2dd..e51f79d8 100644
--- a/crawl4ai/js_snippet/__init__.py
+++ b/crawl4ai/js_snippet/__init__.py
@@ -1,15 +1,18 @@
-import os, sys
+import os
+
# Create a function get name of a js script, then load from the CURRENT folder of this script and return its content as string, make sure its error free
def load_js_script(script_name):
# Get the path of the current script
current_script_path = os.path.dirname(os.path.realpath(__file__))
# Get the path of the script to load
- script_path = os.path.join(current_script_path, script_name + '.js')
+ script_path = os.path.join(current_script_path, script_name + ".js")
# Check if the script exists
if not os.path.exists(script_path):
- raise ValueError(f"Script {script_name} not found in the folder {current_script_path}")
+ raise ValueError(
+ f"Script {script_name} not found in the folder {current_script_path}"
+ )
# Load the content of the script
- with open(script_path, 'r') as f:
+ with open(script_path, "r") as f:
script_content = f.read()
return script_content
diff --git a/crawl4ai/llmtxt.py b/crawl4ai/llmtxt.py
index 94efe076..30256416 100644
--- a/crawl4ai/llmtxt.py
+++ b/crawl4ai/llmtxt.py
@@ -11,16 +11,16 @@ from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
-from litellm import completion, batch_completion
+from litellm import batch_completion
from .async_logger import AsyncLogger
import litellm
import pickle
import hashlib # <--- ADDED for file-hash
-from fnmatch import fnmatch
import glob
litellm.set_verbose = False
+
def _compute_file_hash(file_path: Path) -> str:
"""Compute MD5 hash for the file's entire content."""
hash_md5 = hashlib.md5()
@@ -29,13 +29,14 @@ def _compute_file_hash(file_path: Path) -> str:
hash_md5.update(chunk)
return hash_md5.hexdigest()
+
class AsyncLLMTextManager:
def __init__(
self,
docs_dir: Path,
logger: Optional[AsyncLogger] = None,
max_concurrent_calls: int = 5,
- batch_size: int = 3
+ batch_size: int = 3,
) -> None:
self.docs_dir = docs_dir
self.logger = logger
@@ -51,7 +52,7 @@ class AsyncLLMTextManager:
contents = []
for file_path in doc_batch:
try:
- with open(file_path, 'r', encoding='utf-8') as f:
+ with open(file_path, "r", encoding="utf-8") as f:
contents.append(f.read())
except Exception as e:
self.logger.error(f"Error reading {file_path}: {str(e)}")
@@ -77,43 +78,53 @@ Wrap your response in ... tags.
# Prepare messages for batch processing
messages_list = [
[
- {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"}
+ {
+ "role": "user",
+ "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}",
+ }
]
- for content in contents if content
+ for content in contents
+ if content
]
try:
responses = batch_completion(
model="anthropic/claude-3-5-sonnet-latest",
messages=messages_list,
- logger_fn=None
+ logger_fn=None,
)
# Process responses and save index files
for response, file_path in zip(responses, doc_batch):
try:
index_content_match = re.search(
- r'(.*?)',
+ r"(.*?)",
response.choices[0].message.content,
- re.DOTALL
+ re.DOTALL,
)
if not index_content_match:
- self.logger.warning(f"No ... content found for {file_path}")
+ self.logger.warning(
+ f"No ... content found for {file_path}"
+ )
continue
index_content = re.sub(
r"\n\s*\n", "\n", index_content_match.group(1)
).strip()
if index_content:
- index_file = file_path.with_suffix('.q.md')
- with open(index_file, 'w', encoding='utf-8') as f:
+ index_file = file_path.with_suffix(".q.md")
+ with open(index_file, "w", encoding="utf-8") as f:
f.write(index_content)
self.logger.info(f"Created index file: {index_file}")
else:
- self.logger.warning(f"No index content found in response for {file_path}")
+ self.logger.warning(
+ f"No index content found in response for {file_path}"
+ )
except Exception as e:
- self.logger.error(f"Error processing response for {file_path}: {str(e)}")
+ self.logger.error(
+ f"Error processing response for {file_path}: {str(e)}"
+ )
except Exception as e:
self.logger.error(f"Error in batch completion: {str(e)}")
@@ -171,7 +182,12 @@ Wrap your response in ... tags.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english")) - {
- "how", "what", "when", "where", "why", "which",
+ "how",
+ "what",
+ "when",
+ "where",
+ "why",
+ "which",
}
tokens = []
@@ -222,7 +238,9 @@ Wrap your response in ... tags.
self.logger.info("Checking which .q.md files need (re)indexing...")
# Gather all .q.md files
- q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")]
+ q_files = [
+ self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")
+ ]
# We'll store known (unchanged) facts in these lists
existing_facts: List[str] = []
@@ -243,7 +261,9 @@ Wrap your response in ... tags.
# Otherwise, load the existing cache and compare hash
cache = self._load_or_create_token_cache(qf)
# If the .q.tokens was out of date (i.e. changed hash), we reindex
- if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf):
+ if len(cache["facts"]) == 0 or cache.get(
+ "content_hash"
+ ) != _compute_file_hash(qf):
needSet.append(qf)
else:
# File is unchanged β retrieve cached token data
@@ -255,20 +275,29 @@ Wrap your response in ... tags.
if not needSet and not clear_cache:
# If no file needs reindexing, try loading existing index
if self.maybe_load_bm25_index(clear_cache=False):
- self.logger.info("No new/changed .q.md files found. Using existing BM25 index.")
+ self.logger.info(
+ "No new/changed .q.md files found. Using existing BM25 index."
+ )
return
else:
# If there's no existing index, we must build a fresh index from the old caches
- self.logger.info("No existing BM25 index found. Building from cached facts.")
+ self.logger.info(
+ "No existing BM25 index found. Building from cached facts."
+ )
if existing_facts:
- self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.")
+ self.logger.info(
+ f"Building BM25 index with {len(existing_facts)} cached facts."
+ )
self.bm25_index = BM25Okapi(existing_tokens)
self.tokenized_facts = existing_facts
with open(self.bm25_index_file, "wb") as f:
- pickle.dump({
- "bm25_index": self.bm25_index,
- "tokenized_facts": self.tokenized_facts
- }, f)
+ pickle.dump(
+ {
+ "bm25_index": self.bm25_index,
+ "tokenized_facts": self.tokenized_facts,
+ },
+ f,
+ )
else:
self.logger.warning("No facts found at all. Index remains empty.")
return
@@ -311,7 +340,9 @@ Wrap your response in ... tags.
self._save_token_cache(file, fresh_cache)
mem_usage = process.memory_info().rss / 1024 / 1024
- self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB")
+ self.logger.debug(
+ f"Memory usage after {file.name}: {mem_usage:.2f}MB"
+ )
except Exception as e:
self.logger.error(f"Error processing {file}: {str(e)}")
@@ -328,40 +359,49 @@ Wrap your response in ... tags.
all_tokens = existing_tokens + new_tokens
# 3) Build BM25 index from combined facts
- self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).")
+ self.logger.info(
+ f"Building BM25 index with {len(all_facts)} total facts (old + new)."
+ )
self.bm25_index = BM25Okapi(all_tokens)
self.tokenized_facts = all_facts
# 4) Save the updated BM25 index to disk
with open(self.bm25_index_file, "wb") as f:
- pickle.dump({
- "bm25_index": self.bm25_index,
- "tokenized_facts": self.tokenized_facts
- }, f)
+ pickle.dump(
+ {
+ "bm25_index": self.bm25_index,
+ "tokenized_facts": self.tokenized_facts,
+ },
+ f,
+ )
final_mem = process.memory_info().rss / 1024 / 1024
self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
- async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None:
+ async def generate_index_files(
+ self, force_generate_facts: bool = False, clear_bm25_cache: bool = False
+ ) -> None:
"""
Generate index files for all documents in parallel batches
-
+
Args:
force_generate_facts (bool): If True, regenerate indexes even if they exist
clear_bm25_cache (bool): If True, clear existing BM25 index cache
"""
self.logger.info("Starting index generation for documentation files.")
-
+
md_files = [
- self.docs_dir / f for f in os.listdir(self.docs_dir)
- if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md'])
+ self.docs_dir / f
+ for f in os.listdir(self.docs_dir)
+ if f.endswith(".md") and not any(f.endswith(x) for x in [".q.md", ".xs.md"])
]
# Filter out files that already have .q files unless force=True
if not force_generate_facts:
md_files = [
- f for f in md_files
- if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists()
+ f
+ for f in md_files
+ if not (self.docs_dir / f.name.replace(".md", ".q.md")).exists()
]
if not md_files:
@@ -369,8 +409,10 @@ Wrap your response in ... tags.
else:
# Process documents in batches
for i in range(0, len(md_files), self.batch_size):
- batch = md_files[i:i + self.batch_size]
- self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}")
+ batch = md_files[i : i + self.batch_size]
+ self.logger.info(
+ f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}"
+ )
await self._process_document_batch(batch)
self.logger.info("Index generation complete, building/updating search index.")
@@ -378,21 +420,31 @@ Wrap your response in ... tags.
def generate(self, sections: List[str], mode: str = "extended") -> str:
# Get all markdown files
- all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \
- glob.glob(str(self.docs_dir / "[0-9]*.xs.md"))
-
+ all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + glob.glob(
+ str(self.docs_dir / "[0-9]*.xs.md")
+ )
+
# Extract base names without extensions
- base_docs = {Path(f).name.split('.')[0] for f in all_files
- if not Path(f).name.endswith('.q.md')}
-
+ base_docs = {
+ Path(f).name.split(".")[0]
+ for f in all_files
+ if not Path(f).name.endswith(".q.md")
+ }
+
# Filter by sections if provided
if sections:
- base_docs = {doc for doc in base_docs
- if any(section.lower() in doc.lower() for section in sections)}
-
+ base_docs = {
+ doc
+ for doc in base_docs
+ if any(section.lower() in doc.lower() for section in sections)
+ }
+
# Get file paths based on mode
files = []
- for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999):
+ for doc in sorted(
+ base_docs,
+ key=lambda x: int(x.split("_")[0]) if x.split("_")[0].isdigit() else 999999,
+ ):
if mode == "condensed":
xs_file = self.docs_dir / f"{doc}.xs.md"
regular_file = self.docs_dir / f"{doc}.md"
@@ -404,7 +456,7 @@ Wrap your response in ... tags.
content = []
for file in files:
try:
- with open(file, 'r', encoding='utf-8') as f:
+ with open(file, "r", encoding="utf-8") as f:
fname = Path(file).name
content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
except Exception as e:
@@ -443,15 +495,9 @@ Wrap your response in ... tags.
for file, _ in ranked_files:
main_doc = str(file).replace(".q.md", ".md")
if os.path.exists(self.docs_dir / main_doc):
- with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f:
+ with open(self.docs_dir / main_doc, "r", encoding="utf-8") as f:
only_file_name = main_doc.split("/")[-1]
- content = [
- "#" * 20,
- f"# {only_file_name}",
- "#" * 20,
- "",
- f.read()
- ]
+ content = ["#" * 20, f"# {only_file_name}", "#" * 20, "", f.read()]
results.append("\n".join(content))
return "\n\n---\n\n".join(results)
@@ -482,7 +528,9 @@ Wrap your response in ... tags.
if len(components) == 3:
code_ref = components[2].strip()
code_tokens = self.preprocess_text(code_ref)
- code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens)
+ code_match_score = len(set(query_tokens) & set(code_tokens)) / len(
+ query_tokens
+ )
file_data[file_path]["total_score"] += score
file_data[file_path]["match_count"] += 1
diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py
index 89e5e34e..1e3f0554 100644
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -2,77 +2,94 @@ from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
-from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
+from .content_filter_strategy import RelevantContentFilter
import re
from urllib.parse import urljoin
# Pre-compile the regex pattern
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
+
def fast_urljoin(base: str, url: str) -> str:
"""Fast URL joining for common cases."""
- if url.startswith(('http://', 'https://', 'mailto:', '//')):
+ if url.startswith(("http://", "https://", "mailto:", "//")):
return url
- if url.startswith('/'):
+ if url.startswith("/"):
# Handle absolute paths
- if base.endswith('/'):
+ if base.endswith("/"):
return base[:-1] + url
return base + url
return urljoin(base, url)
+
class MarkdownGenerationStrategy(ABC):
"""Abstract base class for markdown generation strategies."""
- def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
+
+ def __init__(
+ self,
+ content_filter: Optional[RelevantContentFilter] = None,
+ options: Optional[Dict[str, Any]] = None,
+ ):
self.content_filter = content_filter
self.options = options or {}
-
+
@abstractmethod
- def generate_markdown(self,
- cleaned_html: str,
- base_url: str = "",
- html2text_options: Optional[Dict[str, Any]] = None,
- content_filter: Optional[RelevantContentFilter] = None,
- citations: bool = True,
- **kwargs) -> MarkdownGenerationResult:
+ def generate_markdown(
+ self,
+ cleaned_html: str,
+ base_url: str = "",
+ html2text_options: Optional[Dict[str, Any]] = None,
+ content_filter: Optional[RelevantContentFilter] = None,
+ citations: bool = True,
+ **kwargs,
+ ) -> MarkdownGenerationResult:
"""Generate markdown from cleaned HTML."""
pass
+
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
"""
Default implementation of markdown generation strategy.
-
+
How it works:
1. Generate raw markdown from cleaned HTML.
2. Convert links to citations.
3. Generate fit markdown if content filter is provided.
4. Return MarkdownGenerationResult.
-
+
Args:
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
-
+
Returns:
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
"""
- def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
+
+ def __init__(
+ self,
+ content_filter: Optional[RelevantContentFilter] = None,
+ options: Optional[Dict[str, Any]] = None,
+ ):
super().__init__(content_filter, options)
-
- def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
+
+ def convert_links_to_citations(
+ self, markdown: str, base_url: str = ""
+ ) -> Tuple[str, str]:
"""
Convert links in markdown to citations.
-
+
How it works:
1. Find all links in the markdown.
2. Convert links to citations.
3. Return converted markdown and references markdown.
-
+
Note:
This function uses a regex pattern to find links in markdown.
-
+
Args:
markdown (str): Markdown text.
base_url (str): Base URL for URL joins.
-
+
Returns:
Tuple[str, str]: Converted markdown and references markdown.
"""
@@ -81,57 +98,65 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
parts = []
last_end = 0
counter = 1
-
+
for match in LINK_PATTERN.finditer(markdown):
- parts.append(markdown[last_end:match.start()])
+ parts.append(markdown[last_end : match.start()])
text, url, title = match.groups()
-
+
# Use cached URL if available, otherwise compute and cache
- if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
+ if base_url and not url.startswith(("http://", "https://", "mailto:")):
if url not in url_cache:
url_cache[url] = fast_urljoin(base_url, url)
url = url_cache[url]
-
+
if url not in link_map:
desc = []
- if title: desc.append(title)
- if text and text != title: desc.append(text)
+ if title:
+ desc.append(title)
+ if text and text != title:
+ desc.append(text)
link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
counter += 1
-
+
num = link_map[url][0]
- parts.append(f"{text}β¨{num}β©" if not match.group(0).startswith('!') else f"![{text}β¨{num}β©]")
+ parts.append(
+ f"{text}β¨{num}β©"
+ if not match.group(0).startswith("!")
+ else f"![{text}β¨{num}β©]"
+ )
last_end = match.end()
-
+
parts.append(markdown[last_end:])
- converted_text = ''.join(parts)
-
+ converted_text = "".join(parts)
+
# Pre-build reference strings
references = ["\n\n## References\n\n"]
references.extend(
- f"β¨{num}β© {url}{desc}\n"
+ f"β¨{num}β© {url}{desc}\n"
for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
)
-
- return converted_text, ''.join(references)
- def generate_markdown(self,
- cleaned_html: str,
- base_url: str = "",
- html2text_options: Optional[Dict[str, Any]] = None,
- options: Optional[Dict[str, Any]] = None,
- content_filter: Optional[RelevantContentFilter] = None,
- citations: bool = True,
- **kwargs) -> MarkdownGenerationResult:
+ return converted_text, "".join(references)
+
+ def generate_markdown(
+ self,
+ cleaned_html: str,
+ base_url: str = "",
+ html2text_options: Optional[Dict[str, Any]] = None,
+ options: Optional[Dict[str, Any]] = None,
+ content_filter: Optional[RelevantContentFilter] = None,
+ citations: bool = True,
+ **kwargs,
+ ) -> MarkdownGenerationResult:
"""
Generate markdown with citations from cleaned HTML.
-
+
How it works:
1. Generate raw markdown from cleaned HTML.
2. Convert links to citations.
3. Generate fit markdown if content filter is provided.
4. Return MarkdownGenerationResult.
-
+
Args:
cleaned_html (str): Cleaned HTML content.
base_url (str): Base URL for URL joins.
@@ -139,7 +164,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
options (Optional[Dict[str, Any]]): Additional options for markdown generation.
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
citations (bool): Whether to generate citations.
-
+
Returns:
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
"""
@@ -147,16 +172,16 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
# Initialize HTML2Text with default options for better conversion
h = CustomHTML2Text(baseurl=base_url)
default_options = {
- 'body_width': 0, # Disable text wrapping
- 'ignore_emphasis': False,
- 'ignore_links': False,
- 'ignore_images': False,
- 'protect_links': True,
- 'single_line_break': True,
- 'mark_code': True,
- 'escape_snob': False
+ "body_width": 0, # Disable text wrapping
+ "ignore_emphasis": False,
+ "ignore_links": False,
+ "ignore_images": False,
+ "protect_links": True,
+ "single_line_break": True,
+ "mark_code": True,
+ "escape_snob": False,
}
-
+
# Update with custom options if provided
if html2text_options:
default_options.update(html2text_options)
@@ -164,7 +189,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
default_options.update(options)
elif self.options:
default_options.update(self.options)
-
+
h.update_params(**default_options)
# Ensure we have valid input
@@ -178,17 +203,18 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
raw_markdown = h.handle(cleaned_html)
except Exception as e:
raw_markdown = f"Error converting HTML to markdown: {str(e)}"
-
- raw_markdown = raw_markdown.replace(' ```', '```')
+
+ raw_markdown = raw_markdown.replace(" ```", "```")
# Convert links to citations
markdown_with_citations: str = raw_markdown
references_markdown: str = ""
if citations:
try:
- markdown_with_citations, references_markdown = self.convert_links_to_citations(
- raw_markdown, base_url
- )
+ (
+ markdown_with_citations,
+ references_markdown,
+ ) = self.convert_links_to_citations(raw_markdown, base_url)
except Exception as e:
markdown_with_citations = raw_markdown
references_markdown = f"Error generating citations: {str(e)}"
@@ -200,7 +226,9 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
try:
content_filter = content_filter or self.content_filter
filtered_html = content_filter.filter_content(cleaned_html)
- filtered_html = '\n'.join('{}
'.format(s) for s in filtered_html)
+ filtered_html = "\n".join(
+ "{}
".format(s) for s in filtered_html
+ )
fit_markdown = h.handle(filtered_html)
except Exception as e:
fit_markdown = f"Error generating fit markdown: {str(e)}"
diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py
index 3386b0fb..d6da292f 100644
--- a/crawl4ai/migrations.py
+++ b/crawl4ai/migrations.py
@@ -1,13 +1,11 @@
import os
import asyncio
-import logging
from pathlib import Path
import aiosqlite
from typing import Optional
import xxhash
import aiofiles
import shutil
-import time
from datetime import datetime
from .async_logger import AsyncLogger, LogLevel
@@ -17,18 +15,19 @@ logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)
+
class DatabaseMigration:
def __init__(self, db_path: str):
self.db_path = db_path
self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))
-
+
def _ensure_content_dirs(self, base_path: str) -> dict:
dirs = {
- 'html': 'html_content',
- 'cleaned': 'cleaned_html',
- 'markdown': 'markdown_content',
- 'extracted': 'extracted_content',
- 'screenshots': 'screenshots'
+ "html": "html_content",
+ "cleaned": "cleaned_html",
+ "markdown": "markdown_content",
+ "extracted": "extracted_content",
+ "screenshots": "screenshots",
}
content_paths = {}
for key, dirname in dirs.items():
@@ -47,43 +46,55 @@ class DatabaseMigration:
async def _store_content(self, content: str, content_type: str) -> str:
if not content:
return ""
-
+
content_hash = self._generate_content_hash(content)
file_path = os.path.join(self.content_paths[content_type], content_hash)
-
+
if not os.path.exists(file_path):
- async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
+ async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
await f.write(content)
-
+
return content_hash
async def migrate_database(self):
"""Migrate existing database to file-based storage"""
# logger.info("Starting database migration...")
logger.info("Starting database migration...", tag="INIT")
-
+
try:
async with aiosqlite.connect(self.db_path) as db:
# Get all rows
async with db.execute(
- '''SELECT url, html, cleaned_html, markdown,
- extracted_content, screenshot FROM crawled_data'''
+ """SELECT url, html, cleaned_html, markdown,
+ extracted_content, screenshot FROM crawled_data"""
) as cursor:
rows = await cursor.fetchall()
migrated_count = 0
for row in rows:
- url, html, cleaned_html, markdown, extracted_content, screenshot = row
-
+ (
+ url,
+ html,
+ cleaned_html,
+ markdown,
+ extracted_content,
+ screenshot,
+ ) = row
+
# Store content in files and get hashes
- html_hash = await self._store_content(html, 'html')
- cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
- markdown_hash = await self._store_content(markdown, 'markdown')
- extracted_hash = await self._store_content(extracted_content, 'extracted')
- screenshot_hash = await self._store_content(screenshot, 'screenshots')
+ html_hash = await self._store_content(html, "html")
+ cleaned_hash = await self._store_content(cleaned_html, "cleaned")
+ markdown_hash = await self._store_content(markdown, "markdown")
+ extracted_hash = await self._store_content(
+ extracted_content, "extracted"
+ )
+ screenshot_hash = await self._store_content(
+ screenshot, "screenshots"
+ )
# Update database with hashes
- await db.execute('''
+ await db.execute(
+ """
UPDATE crawled_data
SET html = ?,
cleaned_html = ?,
@@ -91,40 +102,51 @@ class DatabaseMigration:
extracted_content = ?,
screenshot = ?
WHERE url = ?
- ''', (html_hash, cleaned_hash, markdown_hash,
- extracted_hash, screenshot_hash, url))
-
+ """,
+ (
+ html_hash,
+ cleaned_hash,
+ markdown_hash,
+ extracted_hash,
+ screenshot_hash,
+ url,
+ ),
+ )
+
migrated_count += 1
if migrated_count % 100 == 0:
logger.info(f"Migrated {migrated_count} records...", tag="INIT")
-
await db.commit()
- logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
+ logger.success(
+ f"Migration completed. {migrated_count} records processed.",
+ tag="COMPLETE",
+ )
except Exception as e:
# logger.error(f"Migration failed: {e}")
logger.error(
message="Migration failed: {error}",
tag="ERROR",
- params={"error": str(e)}
+ params={"error": str(e)},
)
raise e
+
async def backup_database(db_path: str) -> str:
"""Create backup of existing database"""
if not os.path.exists(db_path):
logger.info("No existing database found. Skipping backup.", tag="INIT")
return None
-
+
# Create backup with timestamp
- timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f"{db_path}.backup_{timestamp}"
-
+
try:
# Wait for any potential write operations to finish
await asyncio.sleep(1)
-
+
# Create backup
shutil.copy2(db_path, backup_path)
logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
@@ -132,37 +154,41 @@ async def backup_database(db_path: str) -> str:
except Exception as e:
# logger.error(f"Backup failed: {e}")
logger.error(
- message="Migration failed: {error}",
- tag="ERROR",
- params={"error": str(e)}
- )
+ message="Migration failed: {error}", tag="ERROR", params={"error": str(e)}
+ )
raise e
-
+
+
async def run_migration(db_path: Optional[str] = None):
"""Run database migration"""
if db_path is None:
db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
-
+
if not os.path.exists(db_path):
logger.info("No existing database found. Skipping migration.", tag="INIT")
return
-
+
# Create backup first
backup_path = await backup_database(db_path)
if not backup_path:
return
-
+
migration = DatabaseMigration(db_path)
await migration.migrate_database()
-
+
+
def main():
"""CLI entry point for migration"""
import argparse
- parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
- parser.add_argument('--db-path', help='Custom database path')
+
+ parser = argparse.ArgumentParser(
+ description="Migrate Crawl4AI database to file-based storage"
+ )
+ parser.add_argument("--db-path", help="Custom database path")
args = parser.parse_args()
-
+
asyncio.run(run_migration(args.db_path))
+
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py
index d1872d7e..aa80f673 100644
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -2,109 +2,125 @@ from functools import lru_cache
from pathlib import Path
import subprocess, os
import shutil
-import tarfile
from .model_loader import *
import argparse
-import urllib.request
from crawl4ai.config import MODEL_REPO_BRANCH
+
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
@lru_cache()
def get_available_memory(device):
import torch
- if device.type == 'cuda':
+
+ if device.type == "cuda":
return torch.cuda.get_device_properties(device).total_memory
- elif device.type == 'mps':
- return 48 * 1024 ** 3 # Assuming 8GB for MPS, as a conservative estimate
+ elif device.type == "mps":
+ return 48 * 1024**3 # Assuming 8GB for MPS, as a conservative estimate
else:
return 0
+
@lru_cache()
def calculate_batch_size(device):
available_memory = get_available_memory(device)
-
- if device.type == 'cpu':
+
+ if device.type == "cpu":
return 16
- elif device.type in ['cuda', 'mps']:
+ elif device.type in ["cuda", "mps"]:
# Adjust these thresholds based on your model size and available memory
- if available_memory >= 31 * 1024 ** 3: # > 32GB
+ if available_memory >= 31 * 1024**3: # > 32GB
return 256
- elif available_memory >= 15 * 1024 ** 3: # > 16GB to 32GB
+ elif available_memory >= 15 * 1024**3: # > 16GB to 32GB
return 128
- elif available_memory >= 8 * 1024 ** 3: # 8GB to 16GB
+ elif available_memory >= 8 * 1024**3: # 8GB to 16GB
return 64
else:
return 32
else:
- return 16 # Default batch size
-
+ return 16 # Default batch size
+
+
@lru_cache()
def get_device():
import torch
+
if torch.cuda.is_available():
- device = torch.device('cuda')
+ device = torch.device("cuda")
elif torch.backends.mps.is_available():
- device = torch.device('mps')
+ device = torch.device("mps")
else:
- device = torch.device('cpu')
- return device
-
+ device = torch.device("cpu")
+ return device
+
+
def set_model_device(model):
device = get_device()
- model.to(device)
+ model.to(device)
return model, device
+
@lru_cache()
def get_home_folder():
- home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+ home_folder = os.path.join(
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
+ )
os.makedirs(home_folder, exist_ok=True)
os.makedirs(f"{home_folder}/cache", exist_ok=True)
os.makedirs(f"{home_folder}/models", exist_ok=True)
- return home_folder
+ return home_folder
+
@lru_cache()
def load_bert_base_uncased():
- from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', resume_download=None)
- model = BertModel.from_pretrained('bert-base-uncased', resume_download=None)
+ from transformers import BertTokenizer, BertModel
+
+ tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", resume_download=None)
+ model = BertModel.from_pretrained("bert-base-uncased", resume_download=None)
model.eval()
model, device = set_model_device(model)
return tokenizer, model
+
@lru_cache()
def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
"""Load the Hugging Face model for embedding.
-
+
Args:
model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
-
+
Returns:
tuple: The tokenizer and model.
"""
- from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
+ from transformers import AutoTokenizer, AutoModel
+
tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
model = AutoModel.from_pretrained(model_name, resume_download=None)
model.eval()
model, device = set_model_device(model)
return tokenizer, model
+
@lru_cache()
def load_text_classifier():
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
- import torch
- tokenizer = AutoTokenizer.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
- model = AutoModelForSequenceClassification.from_pretrained("dstefa/roberta-base_topic_classification_nyt_news")
+ tokenizer = AutoTokenizer.from_pretrained(
+ "dstefa/roberta-base_topic_classification_nyt_news"
+ )
+ model = AutoModelForSequenceClassification.from_pretrained(
+ "dstefa/roberta-base_topic_classification_nyt_news"
+ )
model.eval()
model, device = set_model_device(model)
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
return pipe
+
@lru_cache()
def load_text_multilabel_classifier():
from transformers import AutoModelForSequenceClassification, AutoTokenizer
- import numpy as np
from scipy.special import expit
import torch
@@ -116,18 +132,27 @@ def load_text_multilabel_classifier():
# else:
# device = torch.device("cpu")
# # return load_spacy_model(), torch.device("cpu")
-
MODEL = "cardiffnlp/tweet-topic-21-multi"
tokenizer = AutoTokenizer.from_pretrained(MODEL, resume_download=None)
- model = AutoModelForSequenceClassification.from_pretrained(MODEL, resume_download=None)
+ model = AutoModelForSequenceClassification.from_pretrained(
+ MODEL, resume_download=None
+ )
model.eval()
model, device = set_model_device(model)
class_mapping = model.config.id2label
def _classifier(texts, threshold=0.5, max_length=64):
- tokens = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
- tokens = {key: val.to(device) for key, val in tokens.items()} # Move tokens to the selected device
+ tokens = tokenizer(
+ texts,
+ return_tensors="pt",
+ padding=True,
+ truncation=True,
+ max_length=max_length,
+ )
+ tokens = {
+ key: val.to(device) for key, val in tokens.items()
+ } # Move tokens to the selected device
with torch.no_grad():
output = model(**tokens)
@@ -138,35 +163,41 @@ def load_text_multilabel_classifier():
batch_labels = []
for prediction in predictions:
- labels = [class_mapping[i] for i, value in enumerate(prediction) if value == 1]
+ labels = [
+ class_mapping[i] for i, value in enumerate(prediction) if value == 1
+ ]
batch_labels.append(labels)
return batch_labels
return _classifier, device
+
@lru_cache()
def load_nltk_punkt():
import nltk
+
try:
- nltk.data.find('tokenizers/punkt')
+ nltk.data.find("tokenizers/punkt")
except LookupError:
- nltk.download('punkt')
- return nltk.data.find('tokenizers/punkt')
+ nltk.download("punkt")
+ return nltk.data.find("tokenizers/punkt")
+
@lru_cache()
def load_spacy_model():
import spacy
+
name = "models/reuters"
home_folder = get_home_folder()
model_folder = Path(home_folder) / name
-
+
# Check if the model directory already exists
if not (model_folder.exists() and any(model_folder.iterdir())):
repo_url = "https://github.com/unclecode/crawl4ai.git"
- branch = MODEL_REPO_BRANCH
+ branch = MODEL_REPO_BRANCH
repo_folder = Path(home_folder) / "crawl4ai"
-
+
            print("[LOG] ⬇️ Downloading Spacy model for the first time...")
# Remove existing repo folder if it exists
@@ -176,7 +207,9 @@ def load_spacy_model():
if model_folder.exists():
shutil.rmtree(model_folder)
except PermissionError:
- print("[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:")
+ print(
+ "[WARNING] Unable to remove existing folders. Please manually delete the following folders and try again:"
+ )
print(f"- {repo_folder}")
print(f"- {model_folder}")
return None
@@ -187,7 +220,7 @@ def load_spacy_model():
["git", "clone", "-b", branch, repo_url, str(repo_folder)],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
- check=True
+ check=True,
)
# Create the models directory if it doesn't exist
@@ -215,6 +248,7 @@ def load_spacy_model():
print(f"Error loading spacy model: {e}")
return None
+
def download_all_models(remove_existing=False):
"""Download all models required for Crawl4AI."""
if remove_existing:
@@ -243,14 +277,20 @@ def download_all_models(remove_existing=False):
load_nltk_punkt()
     print("[LOG] ✅ All models downloaded successfully.")
+
def main():
print("[LOG] Welcome to the Crawl4AI Model Downloader!")
print("[LOG] This script will download all the models required for Crawl4AI.")
parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
- parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
+ parser.add_argument(
+ "--remove-existing",
+ action="store_true",
+ help="Remove existing models before downloading",
+ )
args = parser.parse_args()
-
+
download_all_models(remove_existing=args.remove_existing)
+
if __name__ == "__main__":
main()
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 6fb362a3..81e08b0c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,21 +1,83 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
+from enum import Enum
from dataclasses import dataclass
from .ssl_certificate import SSLCertificate
+from datetime import datetime
+from datetime import timedelta
+
+###############################
+# Dispatcher Models
+###############################
+@dataclass
+class DomainState:
+ last_request_time: float = 0
+ current_delay: float = 0
+ fail_count: int = 0
+
+
+@dataclass
+class CrawlerTaskResult:
+ task_id: str
+ url: str
+ result: "CrawlResult"
+ memory_usage: float
+ peak_memory: float
+ start_time: datetime
+ end_time: datetime
+ error_message: str = ""
+
+
+class CrawlStatus(Enum):
+ QUEUED = "QUEUED"
+ IN_PROGRESS = "IN_PROGRESS"
+ COMPLETED = "COMPLETED"
+ FAILED = "FAILED"
+
+
+@dataclass
+class CrawlStats:
+ task_id: str
+ url: str
+ status: CrawlStatus
+ start_time: Optional[datetime] = None
+ end_time: Optional[datetime] = None
+ memory_usage: float = 0.0
+ peak_memory: float = 0.0
+ error_message: str = ""
+
+ @property
+ def duration(self) -> str:
+ if not self.start_time:
+ return "0:00"
+ end = self.end_time or datetime.now()
+ duration = end - self.start_time
+ return str(timedelta(seconds=int(duration.total_seconds())))
+
+
+class DisplayMode(Enum):
+ DETAILED = "DETAILED"
+ AGGREGATED = "AGGREGATED"
+
+
+###############################
+# Crawler Models
+###############################
@dataclass
class TokenUsage:
completion_tokens: int = 0
- prompt_tokens: int = 0
+ prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens_details: Optional[dict] = None
prompt_tokens_details: Optional[dict] = None
-
+
class UrlModel(BaseModel):
url: HttpUrl
forced: bool = False
+
class MarkdownGenerationResult(BaseModel):
raw_markdown: str
markdown_with_citations: str
@@ -23,6 +85,16 @@ class MarkdownGenerationResult(BaseModel):
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
+
+class DispatchResult(BaseModel):
+ task_id: str
+ memory_usage: float
+ peak_memory: float
+ start_time: datetime
+ end_time: datetime
+ error_message: str = ""
+
+
class CrawlResult(BaseModel):
url: str
html: str
@@ -32,7 +104,7 @@ class CrawlResult(BaseModel):
links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
- pdf : Optional[bytes] = None
+ pdf: Optional[bytes] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
markdown_v2: Optional[MarkdownGenerationResult] = None
fit_markdown: Optional[str] = None
@@ -44,9 +116,13 @@ class CrawlResult(BaseModel):
response_headers: Optional[dict] = None
status_code: Optional[int] = None
ssl_certificate: Optional[SSLCertificate] = None
+ dispatch_result: Optional[DispatchResult] = None
+ redirected_url: Optional[str] = None
+
class Config:
arbitrary_types_allowed = True
+
class AsyncCrawlResponse(BaseModel):
html: str
response_headers: Dict[str, str]
@@ -56,6 +132,51 @@ class AsyncCrawlResponse(BaseModel):
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[SSLCertificate] = None
+ final_url: Optional[str] = None
class Config:
arbitrary_types_allowed = True
+
+
+###############################
+# Scraping Models
+###############################
+class MediaItem(BaseModel):
+ src: Optional[str] = ""
+ alt: Optional[str] = ""
+ desc: Optional[str] = ""
+ score: Optional[int] = 0
+ type: str = "image"
+ group_id: Optional[int] = 0
+ format: Optional[str] = None
+ width: Optional[int] = None
+
+
+class Link(BaseModel):
+ href: Optional[str] = ""
+ text: Optional[str] = ""
+ title: Optional[str] = ""
+ base_domain: Optional[str] = ""
+
+
+class Media(BaseModel):
+ images: List[MediaItem] = []
+ videos: List[
+ MediaItem
+ ] = [] # Using MediaItem model for now, can be extended with Video model if needed
+ audios: List[
+ MediaItem
+ ] = [] # Using MediaItem model for now, can be extended with Audio model if needed
+
+
+class Links(BaseModel):
+ internal: List[Link] = []
+ external: List[Link] = []
+
+
+class ScrapingResult(BaseModel):
+ cleaned_html: str
+ success: bool
+ media: Media = Media()
+ links: Links = Links()
+ metadata: Dict[str, Any] = {}
diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index 7a963e6d..be5e0310 100644
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -202,3 +202,808 @@ Avoid Common Mistakes:
Result
Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly."""
+
+
+PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.
+
+INPUT HTML:
+<|HTML_CONTENT_START|>
+{HTML}
+<|HTML_CONTENT_END|>
+
+
+SPECIFIC INSTRUCTION:
+<|USER_INSTRUCTION_START|>
+{REQUEST}
+<|USER_INSTRUCTION_END|>
+
+TASK DETAILS:
+1. Content Selection
+- DO: Keep essential information, main content, key details
+- DO: Preserve hierarchical structure using markdown headers
+- DO: Keep code blocks, tables, key lists
+- DON'T: Include navigation menus, ads, footers, cookie notices
+- DON'T: Keep social media widgets, sidebars, related content
+
+2. Content Transformation
+- DO: Use proper markdown syntax (#, ##, **, `, etc)
+- DO: Convert tables to markdown tables
+- DO: Preserve code formatting with ```language blocks
+- DO: Maintain link texts but remove tracking parameters
+- DON'T: Include HTML tags in output
+- DON'T: Keep class names, ids, or other HTML attributes
+
+3. Content Organization
+- DO: Maintain logical flow of information
+- DO: Group related content under appropriate headers
+- DO: Use consistent header levels
+- DON'T: Fragment related content
+- DON'T: Duplicate information
+
+Example Input:
+Setup Guide
Follow these steps...
+
+
+Example Output:
+# Setup Guide
+Follow these steps...
+
+IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.
+
+OUTPUT FORMAT:
+Wrap your response in tags. Use proper markdown throughout.
+
+[Your markdown content here]
+
+
+Begin filtering now."""
+
+JSON_SCHEMA_BUILDER = """
+# HTML Schema Generation Instructions
+You are a specialized model designed to analyze HTML patterns and generate extraction schemas. Your primary job is to create structured JSON schemas that can be used to extract data from HTML in a consistent and reliable way. When presented with HTML content, you must analyze its structure and generate a schema that captures all relevant data points.
+
+## Your Core Responsibilities:
+1. Analyze HTML structure to identify repeating patterns and important data points
+2. Generate valid JSON schemas following the specified format
+3. Create appropriate selectors that will work reliably for data extraction
+4. Name fields meaningfully based on their content and purpose
+5. Handle both specific user requests and autonomous pattern detection
+
+## Available Schema Types You Can Generate:
+
+
+1. Basic Single-Level Schema
+ - Use for simple, flat data structures
+ - Example: Product cards, user profiles
+ - Direct field extractions
+
+2. Nested Object Schema
+ - Use for hierarchical data
+ - Example: Articles with author details
+ - Contains objects within objects
+
+3. List Schema
+ - Use for repeating elements
+ - Example: Comment sections, product lists
+ - Handles arrays of similar items
+
+4. Complex Nested Lists
+ - Use for multi-level data
+ - Example: Categories with subcategories
+ - Multiple levels of nesting
+
+5. Transformation Schema
+ - Use for data requiring processing
+ - Supports regex and text transformations
+ - Special attribute handling
+
+
+
+Your output must always be a JSON object with this structure:
+{
+ "name": "Descriptive name of the pattern",
+ "baseSelector": "CSS selector for the repeating element",
+ "fields": [
+ {
+ "name": "field_name",
+ "selector": "CSS selector",
+ "type": "text|attribute|nested|list|regex",
+ "attribute": "attribute_name", // Optional
+ "transform": "transformation_type", // Optional
+ "pattern": "regex_pattern", // Optional
+ "fields": [] // For nested/list types
+ }
+ ]
+}
+
+
+
+Available field types:
+- text: Direct text extraction
+- attribute: HTML attribute extraction
+- nested: Object containing other fields
+- list: Array of similar items
+- regex: Pattern-based extraction
+
+
+
+1. When given a specific query:
+ - Focus on extracting requested data points
+ - Use most specific selectors possible
+ - Include all fields mentioned in the query
+
+2. When no query is provided:
+ - Identify main content areas
+ - Extract all meaningful data points
+ - Use semantic structure to determine importance
+ - Include prices, dates, titles, and other common data types
+
+3. Always:
+ - Use reliable CSS selectors
+ - Handle dynamic class names appropriately
+ - Create descriptive field names
+ - Follow consistent naming conventions
+
+
+
+1. Basic Product Card Example:
+
+
+
Gaming Laptop
+
$999.99
+

+
+
+
+Generated Schema:
+{
+ "name": "Product Cards",
+ "baseSelector": ".product-card",
+ "baseFields": [
+ {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"},
+ {"name": "data_subcat_id", "type": "attribute", "attribute": "data-subcat-id"}
+ ],
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".product-title",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".price",
+ "type": "text"
+ },
+ {
+ "name": "image_url",
+ "selector": "img",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+}
+
+2. Article with Author Details Example:
+
+
+ The Future of AI
+
+
Dr. Smith
+

+
+
+
+
+Generated Schema:
+{
+ "name": "Article Details",
+ "baseSelector": "article",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h1",
+ "type": "text"
+ },
+ {
+ "name": "author",
+ "type": "nested",
+ "selector": ".author-info",
+ "fields": [
+ {
+ "name": "name",
+ "selector": ".author-name",
+ "type": "text"
+ },
+ {
+ "name": "avatar",
+ "selector": "img",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+ }
+ ]
+}
+
+3. Comments Section Example:
+
+
+
+
+Generated Schema:
+{
+ "name": "Comment Section",
+ "baseSelector": ".comments-container",
+ "baseFields": [
+ {"name": "data_user_id", "type": "attribute", "attribute": "data-user-id"}
+ ],
+ "fields": [
+ {
+ "name": "comments",
+ "type": "list",
+ "selector": ".comment",
+ "fields": [
+ {
+ "name": "user",
+ "selector": ".user-name",
+ "type": "text"
+ },
+ {
+ "name": "content",
+ "selector": ".comment-text",
+ "type": "text"
+ }
+ ]
+ }
+ ]
+}
+
+4. E-commerce Categories Example:
+
+
+
Electronics
+
+
Laptops
+
+ MacBook Pro
+ $1299
+
+
+ Dell XPS
+ $999
+
+
+
+
+
+Generated Schema:
+{
+ "name": "E-commerce Categories",
+ "baseSelector": ".category-section",
+ "baseFields": [
+ {"name": "data_category", "type": "attribute", "attribute": "data-category"}
+ ],
+ "fields": [
+ {
+ "name": "category_name",
+ "selector": "h2",
+ "type": "text"
+ },
+ {
+ "name": "subcategories",
+ "type": "nested_list",
+ "selector": ".subcategory",
+ "fields": [
+ {
+ "name": "name",
+ "selector": "h3",
+ "type": "text"
+ },
+ {
+ "name": "products",
+ "type": "list",
+ "selector": ".product",
+ "fields": [
+ {
+ "name": "name",
+ "selector": ".product-name",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".price",
+ "type": "text"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+}
+
+5. Job Listings with Transformations Example:
+
+
+
Senior Developer
+ Salary: $120,000/year
+ New York, NY
+
+
+
+Generated Schema:
+{
+ "name": "Job Listings",
+ "baseSelector": ".job-post",
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".job-title",
+ "type": "text",
+ "transform": "uppercase"
+ },
+ {
+ "name": "salary",
+ "selector": ".salary-text",
+ "type": "regex",
+ "pattern": "\\$([\\d,]+)"
+ },
+ {
+ "name": "location",
+ "selector": ".location",
+ "type": "text",
+ "transform": "strip"
+ }
+ ]
+}
+
+6. Skyscanner Place Card Example:
+
+
+
+
+Generated Schema:
+{
+ "name": "Skyscanner Place Cards",
+ "baseSelector": "div[class^='PlaceCard_descriptionContainer__']",
+ "baseFields": [
+ {"name": "data_testid", "type": "attribute", "attribute": "data-testid"}
+ ],
+ "fields": [
+ {
+ "name": "city_name",
+ "selector": "div[class^='PlaceCard_nameContent__'] .BpkText_bpk-text--heading-4__",
+ "type": "text"
+ },
+ {
+ "name": "country_name",
+ "selector": "span[class*='PlaceCard_subName__']",
+ "type": "text"
+ },
+ {
+ "name": "description",
+ "selector": "span[class*='PlaceCard_advertLabel__']",
+ "type": "text"
+ },
+ {
+ "name": "flight_price",
+ "selector": "a[data-testid='flights-link'] .BpkText_bpk-text--heading-5__",
+ "type": "text"
+ },
+ {
+ "name": "flight_url",
+ "selector": "a[data-testid='flights-link']",
+ "type": "attribute",
+ "attribute": "href"
+ }
+ ]
+}
+
+
+
+
+Your output must:
+1. Be valid JSON only
+2. Include no explanatory text
+3. Follow the exact schema structure provided
+4. Use appropriate field types
+5. Include all required fields
+6. Use valid CSS selectors
+
+
+"""
+
+JSON_SCHEMA_BUILDER_XPATH = """
+# HTML Schema Generation Instructions
+You are a specialized model designed to analyze HTML patterns and generate extraction schemas. Your primary job is to create structured JSON schemas that can be used to extract data from HTML in a consistent and reliable way. When presented with HTML content, you must analyze its structure and generate a schema that captures all relevant data points.
+
+## Your Core Responsibilities:
+1. Analyze HTML structure to identify repeating patterns and important data points
+2. Generate valid JSON schemas following the specified format
+3. Create appropriate XPath selectors that will work reliably for data extraction
+4. Name fields meaningfully based on their content and purpose
+5. Handle both specific user requests and autonomous pattern detection
+
+## Available Schema Types You Can Generate:
+
+
+1. Basic Single-Level Schema
+ - Use for simple, flat data structures
+ - Example: Product cards, user profiles
+ - Direct field extractions
+
+2. Nested Object Schema
+ - Use for hierarchical data
+ - Example: Articles with author details
+ - Contains objects within objects
+
+3. List Schema
+ - Use for repeating elements
+ - Example: Comment sections, product lists
+ - Handles arrays of similar items
+
+4. Complex Nested Lists
+ - Use for multi-level data
+ - Example: Categories with subcategories
+ - Multiple levels of nesting
+
+5. Transformation Schema
+ - Use for data requiring processing
+ - Supports regex and text transformations
+ - Special attribute handling
+
+
+
+Your output must always be a JSON object with this structure:
+{
+ "name": "Descriptive name of the pattern",
+ "baseSelector": "XPath selector for the repeating element",
+ "fields": [
+ {
+ "name": "field_name",
+ "selector": "XPath selector",
+ "type": "text|attribute|nested|list|regex",
+ "attribute": "attribute_name", // Optional
+ "transform": "transformation_type", // Optional
+ "pattern": "regex_pattern", // Optional
+ "fields": [] // For nested/list types
+ }
+ ]
+}
+
+
+
+Available field types:
+- text: Direct text extraction
+- attribute: HTML attribute extraction
+- nested: Object containing other fields
+- list: Array of similar items
+- regex: Pattern-based extraction
+
+
+
+1. When given a specific query:
+ - Focus on extracting requested data points
+ - Use most specific selectors possible
+ - Include all fields mentioned in the query
+
+2. When no query is provided:
+ - Identify main content areas
+ - Extract all meaningful data points
+ - Use semantic structure to determine importance
+ - Include prices, dates, titles, and other common data types
+
+3. Always:
+ - Use reliable XPath selectors
+ - Handle dynamic element IDs appropriately
+ - Create descriptive field names
+ - Follow consistent naming conventions
+
+
+
+1. Basic Product Card Example:
+
+
+
Gaming Laptop
+
$999.99
+

+
+
+
+Generated Schema:
+{
+ "name": "Product Cards",
+ "baseSelector": "//div[@class='product-card']",
+ "baseFields": [
+ {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"},
+ {"name": "data_subcat_id", "type": "attribute", "attribute": "data-subcat-id"}
+ ],
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".//h2[@class='product-title']",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".//span[@class='price']",
+ "type": "text"
+ },
+ {
+ "name": "image_url",
+ "selector": ".//img",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+}
+
+2. Article with Author Details Example:
+
+
+ The Future of AI
+
+
Dr. Smith
+

+
+
+
+
+Generated Schema:
+{
+ "name": "Article Details",
+ "baseSelector": "//article",
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".//h1",
+ "type": "text"
+ },
+ {
+ "name": "author",
+ "type": "nested",
+ "selector": ".//div[@class='author-info']",
+ "fields": [
+ {
+ "name": "name",
+ "selector": ".//span[@class='author-name']",
+ "type": "text"
+ },
+ {
+ "name": "avatar",
+ "selector": ".//img",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+ }
+ ]
+}
+
+3. Comments Section Example:
+
+
+
+
+Generated Schema:
+{
+ "name": "Comment Section",
+ "baseSelector": "//div[@class='comments-container']",
+ "fields": [
+ {
+ "name": "comments",
+ "type": "list",
+ "selector": ".//div[@class='comment']",
+ "baseFields": [
+ {"name": "data_user_id", "type": "attribute", "attribute": "data-user-id"}
+ ],
+ "fields": [
+ {
+ "name": "user",
+ "selector": ".//div[@class='user-name']",
+ "type": "text"
+ },
+ {
+ "name": "content",
+ "selector": ".//p[@class='comment-text']",
+ "type": "text"
+ }
+ ]
+ }
+ ]
+}
+
+4. E-commerce Categories Example:
+
+
+
Electronics
+
+
Laptops
+
+ MacBook Pro
+ $1299
+
+
+ Dell XPS
+ $999
+
+
+
+
+
+Generated Schema:
+{
+ "name": "E-commerce Categories",
+ "baseSelector": "//div[@class='category-section']",
+ "baseFields": [
+ {"name": "data_category", "type": "attribute", "attribute": "data-category"}
+ ],
+ "fields": [
+ {
+ "name": "category_name",
+ "selector": ".//h2",
+ "type": "text"
+ },
+ {
+ "name": "subcategories",
+ "type": "nested_list",
+ "selector": ".//div[@class='subcategory']",
+ "fields": [
+ {
+ "name": "name",
+ "selector": ".//h3",
+ "type": "text"
+ },
+ {
+ "name": "products",
+ "type": "list",
+ "selector": ".//div[@class='product']",
+ "fields": [
+ {
+ "name": "name",
+ "selector": ".//span[@class='product-name']",
+ "type": "text"
+ },
+ {
+ "name": "price",
+ "selector": ".//span[@class='price']",
+ "type": "text"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+}
+
+5. Job Listings with Transformations Example:
+
+
+
Senior Developer
+ Salary: $120,000/year
+ New York, NY
+
+
+
+Generated Schema:
+{
+ "name": "Job Listings",
+ "baseSelector": "//div[@class='job-post']",
+ "fields": [
+ {
+ "name": "title",
+ "selector": ".//h3[@class='job-title']",
+ "type": "text",
+ "transform": "uppercase"
+ },
+ {
+ "name": "salary",
+ "selector": ".//span[@class='salary-text']",
+ "type": "regex",
+ "pattern": "\\$([\\d,]+)"
+ },
+ {
+ "name": "location",
+ "selector": ".//span[@class='location']",
+ "type": "text",
+ "transform": "strip"
+ }
+ ]
+}
+
+6. Skyscanner Place Card Example:
+
+
+
+
+Generated Schema:
+{
+ "name": "Skyscanner Place Cards",
+ "baseSelector": "//div[contains(@class, 'PlaceCard_descriptionContainer__')]",
+ "baseFields": [
+ {"name": "data_testid", "type": "attribute", "attribute": "data-testid"}
+ ],
+ "fields": [
+ {
+ "name": "city_name",
+ "selector": ".//div[contains(@class, 'PlaceCard_nameContent__')]//span[contains(@class, 'BpkText_bpk-text--heading-4__')]",
+ "type": "text"
+ },
+ {
+ "name": "country_name",
+ "selector": ".//span[contains(@class, 'PlaceCard_subName__')]",
+ "type": "text"
+ },
+ {
+ "name": "description",
+ "selector": ".//span[contains(@class, 'PlaceCard_advertLabel__')]",
+ "type": "text"
+ },
+ {
+ "name": "flight_price",
+ "selector": ".//a[@data-testid='flights-link']//span[contains(@class, 'BpkText_bpk-text--heading-5__')]",
+ "type": "text"
+ },
+ {
+ "name": "flight_url",
+ "selector": ".//a[@data-testid='flights-link']",
+ "type": "attribute",
+ "attribute": "href"
+ }
+ ]
+}
+
+
+
+Your output must:
+1. Be valid JSON only
+2. Include no explanatory text
+3. Follow the exact schema structure provided
+4. Use appropriate field types
+5. Include all required fields
+6. Use valid XPath selectors
+
+"""
\ No newline at end of file
diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py
index 97529e3e..722bb7f9 100644
--- a/crawl4ai/ssl_certificate.py
+++ b/crawl4ai/ssl_certificate.py
@@ -13,10 +13,10 @@ from pathlib import Path
class SSLCertificate:
"""
A class representing an SSL certificate with methods to export in various formats.
-
+
Attributes:
cert_info (Dict[str, Any]): The certificate information.
-
+
Methods:
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
@@ -26,32 +26,35 @@ class SSLCertificate:
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
export_as_text() -> str: Export the certificate as text format.
"""
+
def __init__(self, cert_info: Dict[str, Any]):
self._cert_info = self._decode_cert_data(cert_info)
@staticmethod
- def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
+ def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
"""
Create SSLCertificate instance from a URL.
-
+
Args:
url (str): URL of the website.
timeout (int): Timeout for the connection (default: 10).
-
+
Returns:
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
"""
try:
hostname = urlparse(url).netloc
- if ':' in hostname:
- hostname = hostname.split(':')[0]
-
+ if ":" in hostname:
+ hostname = hostname.split(":")[0]
+
context = ssl.create_default_context()
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert_binary = ssock.getpeercert(binary_form=True)
- x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)
-
+ x509 = OpenSSL.crypto.load_certificate(
+ OpenSSL.crypto.FILETYPE_ASN1, cert_binary
+ )
+
cert_info = {
"subject": dict(x509.get_subject().get_components()),
"issuer": dict(x509.get_issuer().get_components()),
@@ -61,32 +64,33 @@ class SSLCertificate:
"not_after": x509.get_notAfter(),
"fingerprint": x509.digest("sha256").hex(),
"signature_algorithm": x509.get_signature_algorithm(),
- "raw_cert": base64.b64encode(cert_binary)
+ "raw_cert": base64.b64encode(cert_binary),
}
-
+
# Add extensions
extensions = []
for i in range(x509.get_extension_count()):
ext = x509.get_extension(i)
- extensions.append({
- "name": ext.get_short_name(),
- "value": str(ext)
- })
+ extensions.append(
+ {"name": ext.get_short_name(), "value": str(ext)}
+ )
cert_info["extensions"] = extensions
-
+
return SSLCertificate(cert_info)
-
- except Exception as e:
+
+ except Exception:
return None
@staticmethod
def _decode_cert_data(data: Any) -> Any:
"""Helper method to decode bytes in certificate data."""
if isinstance(data, bytes):
- return data.decode('utf-8')
+ return data.decode("utf-8")
elif isinstance(data, dict):
return {
- (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v)
+ (
+ k.decode("utf-8") if isinstance(k, bytes) else k
+ ): SSLCertificate._decode_cert_data(v)
for k, v in data.items()
}
elif isinstance(data, list):
@@ -96,58 +100,57 @@ class SSLCertificate:
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
"""
Export certificate as JSON.
-
+
Args:
filepath (Optional[str]): Path to save the JSON file (default: None).
-
+
Returns:
Optional[str]: JSON string if successful, None otherwise.
"""
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
if filepath:
- Path(filepath).write_text(json_str, encoding='utf-8')
+ Path(filepath).write_text(json_str, encoding="utf-8")
return None
return json_str
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
"""
Export certificate as PEM.
-
+
Args:
filepath (Optional[str]): Path to save the PEM file (default: None).
-
+
Returns:
Optional[str]: PEM string if successful, None otherwise.
"""
try:
x509 = OpenSSL.crypto.load_certificate(
- OpenSSL.crypto.FILETYPE_ASN1,
- base64.b64decode(self._cert_info['raw_cert'])
+ OpenSSL.crypto.FILETYPE_ASN1,
+ base64.b64decode(self._cert_info["raw_cert"]),
)
pem_data = OpenSSL.crypto.dump_certificate(
- OpenSSL.crypto.FILETYPE_PEM,
- x509
- ).decode('utf-8')
-
+ OpenSSL.crypto.FILETYPE_PEM, x509
+ ).decode("utf-8")
+
if filepath:
- Path(filepath).write_text(pem_data, encoding='utf-8')
+ Path(filepath).write_text(pem_data, encoding="utf-8")
return None
return pem_data
- except Exception as e:
+ except Exception:
return None
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
"""
Export certificate as DER.
-
+
Args:
filepath (Optional[str]): Path to save the DER file (default: None).
-
+
Returns:
Optional[bytes]: DER bytes if successful, None otherwise.
"""
try:
- der_data = base64.b64decode(self._cert_info['raw_cert'])
+ der_data = base64.b64decode(self._cert_info["raw_cert"])
if filepath:
Path(filepath).write_bytes(der_data)
return None
@@ -158,24 +161,24 @@ class SSLCertificate:
@property
def issuer(self) -> Dict[str, str]:
"""Get certificate issuer information."""
- return self._cert_info.get('issuer', {})
+ return self._cert_info.get("issuer", {})
@property
def subject(self) -> Dict[str, str]:
"""Get certificate subject information."""
- return self._cert_info.get('subject', {})
+ return self._cert_info.get("subject", {})
@property
def valid_from(self) -> str:
"""Get certificate validity start date."""
- return self._cert_info.get('not_before', '')
+ return self._cert_info.get("not_before", "")
@property
def valid_until(self) -> str:
"""Get certificate validity end date."""
- return self._cert_info.get('not_after', '')
+ return self._cert_info.get("not_after", "")
@property
def fingerprint(self) -> str:
"""Get certificate fingerprint."""
- return self._cert_info.get('fingerprint', '')
+ return self._cert_info.get("fingerprint", "")
diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py
index 6679bb1b..4f0f42cb 100644
--- a/crawl4ai/user_agent_generator.py
+++ b/crawl4ai/user_agent_generator.py
@@ -6,7 +6,7 @@ import re
class UserAgentGenerator:
"""
Generate random user agents with specified constraints.
-
+
Attributes:
desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
@@ -18,7 +18,7 @@ class UserAgentGenerator:
safari_versions (list): A list of possible Safari browser versions.
ios_versions (list): A list of possible iOS browser versions.
android_versions (list): A list of possible Android browser versions.
-
+
Methods:
generate_user_agent(
platform: Literal["desktop", "mobile"] = "desktop",
@@ -30,8 +30,9 @@ class UserAgentGenerator:
safari_version: Optional[str] = None,
ios_version: Optional[str] = None,
android_version: Optional[str] = None
- ): Generates a random user agent string based on the specified parameters.
+ ): Generates a random user agent string based on the specified parameters.
"""
+
def __init__(self):
# Previous platform definitions remain the same...
self.desktop_platforms = {
@@ -47,7 +48,7 @@ class UserAgentGenerator:
"generic": "(X11; Linux x86_64)",
"ubuntu": "(X11; Ubuntu; Linux x86_64)",
"chrome_os": "(X11; CrOS x86_64 14541.0.0)",
- }
+ },
}
self.mobile_platforms = {
@@ -60,26 +61,14 @@ class UserAgentGenerator:
"ios": {
"iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
"ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
- }
+ },
}
# Browser Combinations
self.browser_combinations = {
- 1: [
- ["chrome"],
- ["firefox"],
- ["safari"],
- ["edge"]
- ],
- 2: [
- ["gecko", "firefox"],
- ["chrome", "safari"],
- ["webkit", "safari"]
- ],
- 3: [
- ["chrome", "safari", "edge"],
- ["webkit", "chrome", "safari"]
- ]
+ 1: [["chrome"], ["firefox"], ["safari"], ["edge"]],
+ 2: [["gecko", "firefox"], ["chrome", "safari"], ["webkit", "safari"]],
+ 3: [["chrome", "safari", "edge"], ["webkit", "chrome", "safari"]],
}
# Rendering Engines with versions
@@ -90,7 +79,7 @@ class UserAgentGenerator:
"Gecko/20100101",
"Gecko/20100101", # Firefox usually uses this constant version
"Gecko/2010010",
- ]
+ ],
}
# Browser Versions
@@ -135,25 +124,25 @@ class UserAgentGenerator:
def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
"""
Get a valid combination of browser versions.
-
+
How it works:
1. Check if the number of browsers is supported.
2. Randomly choose a combination of browsers.
3. Iterate through the combination and add browser versions.
4. Return the browser stack.
-
+
Args:
num_browsers: Number of browser specifications (1-3)
-
+
Returns:
List[str]: A list of browser versions.
"""
if num_browsers not in self.browser_combinations:
raise ValueError(f"Unsupported number of browsers: {num_browsers}")
-
+
combination = random.choice(self.browser_combinations[num_browsers])
browser_stack = []
-
+
for browser in combination:
if browser == "chrome":
browser_stack.append(random.choice(self.chrome_versions))
@@ -167,18 +156,20 @@ class UserAgentGenerator:
browser_stack.append(random.choice(self.rendering_engines["gecko"]))
elif browser == "webkit":
browser_stack.append(self.rendering_engines["chrome_webkit"])
-
+
return browser_stack
- def generate(self,
- device_type: Optional[Literal['desktop', 'mobile']] = None,
- os_type: Optional[str] = None,
- device_brand: Optional[str] = None,
- browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
- num_browsers: int = 3) -> str:
+ def generate(
+ self,
+ device_type: Optional[Literal["desktop", "mobile"]] = None,
+ os_type: Optional[str] = None,
+ device_brand: Optional[str] = None,
+ browser_type: Optional[Literal["chrome", "edge", "safari", "firefox"]] = None,
+ num_browsers: int = 3,
+ ) -> str:
"""
Generate a random user agent with specified constraints.
-
+
Args:
device_type: 'desktop' or 'mobile'
os_type: 'windows', 'macos', 'linux', 'android', 'ios'
@@ -188,23 +179,23 @@ class UserAgentGenerator:
"""
# Get platform string
platform = self.get_random_platform(device_type, os_type, device_brand)
-
+
# Start with Mozilla
components = ["Mozilla/5.0", platform]
-
+
# Add browser stack
browser_stack = self.get_browser_stack(num_browsers)
-
+
# Add appropriate legacy token based on browser stack
if "Firefox" in str(browser_stack):
components.append(random.choice(self.rendering_engines["gecko"]))
elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack):
components.append(self.rendering_engines["chrome_webkit"])
components.append("(KHTML, like Gecko)")
-
+
# Add browser versions
components.extend(browser_stack)
-
+
return " ".join(components)
def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
@@ -215,16 +206,20 @@ class UserAgentGenerator:
def get_random_platform(self, device_type, os_type, device_brand):
"""Helper method to get random platform based on constraints"""
- platforms = self.desktop_platforms if device_type == 'desktop' else \
- self.mobile_platforms if device_type == 'mobile' else \
- {**self.desktop_platforms, **self.mobile_platforms}
-
+ platforms = (
+ self.desktop_platforms
+ if device_type == "desktop"
+ else self.mobile_platforms
+ if device_type == "mobile"
+ else {**self.desktop_platforms, **self.mobile_platforms}
+ )
+
if os_type:
for platform_group in [self.desktop_platforms, self.mobile_platforms]:
if os_type in platform_group:
platforms = {os_type: platform_group[os_type]}
break
-
+
os_key = random.choice(list(platforms.keys()))
if device_brand and device_brand in platforms[os_key]:
return platforms[os_key][device_brand]
@@ -233,73 +228,72 @@ class UserAgentGenerator:
def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
"""Parse a user agent string to extract browser and version information"""
browsers = {
- 'chrome': r'Chrome/(\d+)',
- 'edge': r'Edg/(\d+)',
- 'safari': r'Version/(\d+)',
- 'firefox': r'Firefox/(\d+)'
+ "chrome": r"Chrome/(\d+)",
+ "edge": r"Edg/(\d+)",
+ "safari": r"Version/(\d+)",
+ "firefox": r"Firefox/(\d+)",
}
-
+
result = {}
for browser, pattern in browsers.items():
match = re.search(pattern, user_agent)
if match:
result[browser] = match.group(1)
-
+
return result
def generate_client_hints(self, user_agent: str) -> str:
"""Generate Sec-CH-UA header value based on user agent string"""
browsers = self.parse_user_agent(user_agent)
-
+
# Client hints components
hints = []
-
+
# Handle different browser combinations
- if 'chrome' in browsers:
+ if "chrome" in browsers:
hints.append(f'"Chromium";v="{browsers["chrome"]}"')
hints.append('"Not_A Brand";v="8"')
-
- if 'edge' in browsers:
+
+ if "edge" in browsers:
hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
else:
hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')
-
- elif 'firefox' in browsers:
+
+ elif "firefox" in browsers:
# Firefox doesn't typically send Sec-CH-UA
return '""'
-
- elif 'safari' in browsers:
+
+ elif "safari" in browsers:
# Safari's format for client hints
hints.append(f'"Safari";v="{browsers["safari"]}"')
hints.append('"Not_A Brand";v="8"')
-
- return ', '.join(hints)
+
+ return ", ".join(hints)
+
# Example usage:
if __name__ == "__main__":
generator = UserAgentGenerator()
print(generator.generate())
-
+
print("\nSingle browser (Chrome):")
- print(generator.generate(num_browsers=1, browser_type='chrome'))
-
+ print(generator.generate(num_browsers=1, browser_type="chrome"))
+
print("\nTwo browsers (Gecko/Firefox):")
print(generator.generate(num_browsers=2))
-
+
print("\nThree browsers (Chrome/Safari/Edge):")
print(generator.generate(num_browsers=3))
-
+
print("\nFirefox on Linux:")
- print(generator.generate(
- device_type='desktop',
- os_type='linux',
- browser_type='firefox',
- num_browsers=2
- ))
-
+ print(
+ generator.generate(
+ device_type="desktop",
+ os_type="linux",
+ browser_type="firefox",
+ num_browsers=2,
+ )
+ )
+
print("\nChrome/Safari/Edge on Windows:")
- print(generator.generate(
- device_type='desktop',
- os_type='windows',
- num_browsers=3
- ))
\ No newline at end of file
+ print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3))
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 6fd7429f..ea1309a8 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -14,7 +14,7 @@ from typing import Dict, Any
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
-from typing import Optional, Tuple, Dict, Any
+from typing import Dict, Any
import xxhash
from colorama import Fore, Style, init
import textwrap
@@ -27,7 +27,14 @@ import asyncio
class InvalidCSSSelectorError(Exception):
pass
-def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
+
+def create_box_message(
+ message: str,
+ type: str = "info",
+ width: int = 120,
+ add_newlines: bool = True,
+ double_line: bool = False,
+) -> str:
"""
Create a styled message box with colored borders and formatted text.
@@ -53,7 +60,7 @@ def create_box_message(message: str, type: str = "info", width: int = 120, add_n
# Define border and text colors for different types
styles = {
"warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "β "),
- "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "βΉ"),
+ "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "βΉ"),
"success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "β"),
"error": (Fore.RED, Fore.LIGHTRED_EX, "Γ"),
}
@@ -63,24 +70,24 @@ def create_box_message(message: str, type: str = "info", width: int = 120, add_n
# Define box characters based on line style
box_chars = {
"single": ("β", "β", "β", "β", "β", "β"),
- "double": ("β", "β", "β", "β", "β", "β")
+ "double": ("β", "β", "β", "β", "β", "β"),
}
line_style = "double" if double_line else "single"
h_line, v_line, tl, tr, bl, br = box_chars[line_style]
# Process lines with lighter text color
formatted_lines = []
- raw_lines = message.split('\n')
+ raw_lines = message.split("\n")
if raw_lines:
first_line = f"{prefix} {raw_lines[0].strip()}"
- wrapped_first = textwrap.fill(first_line, width=width-4)
- formatted_lines.extend(wrapped_first.split('\n'))
-
+ wrapped_first = textwrap.fill(first_line, width=width - 4)
+ formatted_lines.extend(wrapped_first.split("\n"))
+
for line in raw_lines[1:]:
if line.strip():
- wrapped = textwrap.fill(f" {line.strip()}", width=width-4)
- formatted_lines.extend(wrapped.split('\n'))
+ wrapped = textwrap.fill(f" {line.strip()}", width=width - 4)
+ formatted_lines.extend(wrapped.split("\n"))
else:
formatted_lines.append("")
@@ -88,8 +95,11 @@ def create_box_message(message: str, type: str = "info", width: int = 120, add_n
horizontal_line = h_line * (width - 1)
box = [
f"{border_color}{tl}{horizontal_line}{tr}",
- *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines],
- f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
+ *[
+ f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}"
+ for line in formatted_lines
+ ],
+ f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}",
]
result = "\n".join(box)
@@ -98,6 +108,7 @@ def create_box_message(message: str, type: str = "info", width: int = 120, add_n
return result
+
def calculate_semaphore_count():
"""
Calculate the optimal semaphore count based on system resources.
@@ -111,13 +122,14 @@ def calculate_semaphore_count():
Returns:
int: The calculated semaphore count.
"""
-
+
cpu_count = os.cpu_count()
- memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
+ memory_gb = get_system_memory() / (1024**3) # Convert to GB
base_count = max(1, cpu_count // 2)
memory_based_cap = int(memory_gb / 2) # Assume 2GB per instance
return min(base_count, memory_based_cap)
+
def get_system_memory():
"""
Get the total system memory in bytes.
@@ -136,30 +148,34 @@ def get_system_memory():
system = platform.system()
if system == "Linux":
- with open('/proc/meminfo', 'r') as mem:
+ with open("/proc/meminfo", "r") as mem:
for line in mem:
- if line.startswith('MemTotal:'):
+ if line.startswith("MemTotal:"):
return int(line.split()[1]) * 1024 # Convert KB to bytes
elif system == "Darwin": # macOS
import subprocess
- output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8')
+
+ output = subprocess.check_output(["sysctl", "-n", "hw.memsize"]).decode("utf-8")
return int(output.strip())
elif system == "Windows":
import ctypes
+
kernel32 = ctypes.windll.kernel32
c_ulonglong = ctypes.c_ulonglong
+
class MEMORYSTATUSEX(ctypes.Structure):
_fields_ = [
- ('dwLength', ctypes.c_ulong),
- ('dwMemoryLoad', ctypes.c_ulong),
- ('ullTotalPhys', c_ulonglong),
- ('ullAvailPhys', c_ulonglong),
- ('ullTotalPageFile', c_ulonglong),
- ('ullAvailPageFile', c_ulonglong),
- ('ullTotalVirtual', c_ulonglong),
- ('ullAvailVirtual', c_ulonglong),
- ('ullAvailExtendedVirtual', c_ulonglong),
+ ("dwLength", ctypes.c_ulong),
+ ("dwMemoryLoad", ctypes.c_ulong),
+ ("ullTotalPhys", c_ulonglong),
+ ("ullAvailPhys", c_ulonglong),
+ ("ullTotalPageFile", c_ulonglong),
+ ("ullAvailPageFile", c_ulonglong),
+ ("ullTotalVirtual", c_ulonglong),
+ ("ullAvailVirtual", c_ulonglong),
+ ("ullAvailExtendedVirtual", c_ulonglong),
]
+
memoryStatus = MEMORYSTATUSEX()
memoryStatus.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
kernel32.GlobalMemoryStatusEx(ctypes.byref(memoryStatus))
@@ -167,6 +183,7 @@ def get_system_memory():
else:
raise OSError("Unsupported operating system")
+
def get_home_folder():
"""
Get or create the home folder for Crawl4AI configuration and cache.
@@ -180,75 +197,136 @@ def get_home_folder():
str: The path to the Crawl4AI home folder.
"""
- home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
+ home_folder = os.path.join(
+ os.getenv(
+ "CRAWL4_AI_BASE_DIRECTORY",
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
+ ),
+ ".crawl4ai",
+ )
os.makedirs(home_folder, exist_ok=True)
os.makedirs(f"{home_folder}/cache", exist_ok=True)
os.makedirs(f"{home_folder}/models", exist_ok=True)
- return home_folder
+ return home_folder
+
+async def get_chromium_path(browser_type) -> str:
+ """Returns the browser executable path using playwright's browser management.
+
+ Uses playwright's built-in browser management to get the correct browser executable
+ path regardless of platform. This ensures we're using the same browser version
+ that playwright is tested with.
+
+ Returns:
+ str: Path to browser executable
+ Raises:
+ RuntimeError: If browser executable cannot be found
+ """
+ browser_types = {
+ "chromium": "chromium",
+ "firefox": "firefox",
+ "webkit": "webkit"
+ }
+
+ browser_type = browser_types.get(browser_type)
+ if not browser_type:
+ raise RuntimeError(f"Unsupported browser type: {browser_type}")
+
+ # Check if a path has already been saved for this browser type
+ home_folder = get_home_folder()
+ path_file = os.path.join(home_folder, f"{browser_type.lower()}.path")
+ if os.path.exists(path_file):
+ with open(path_file, "r") as f:
+ return f.read()
+
+ from playwright.async_api import async_playwright
+ async with async_playwright() as p:
+ browsers = {
+ 'chromium': p.chromium,
+ 'firefox': p.firefox,
+ 'webkit': p.webkit
+ }
+
+ if browser_type.lower() not in browsers:
+ raise ValueError(
+ f"Invalid browser type. Must be one of: {', '.join(browsers.keys())}"
+ )
+
+    # Save the path in the crawl4ai home folder
+ home_folder = get_home_folder()
+ browser_path = browsers[browser_type.lower()].executable_path
+ if not browser_path:
+ raise RuntimeError(f"Browser executable not found for type: {browser_type}")
+ # Save the path in a text file with browser type name
+ with open(os.path.join(home_folder, f"{browser_type.lower()}.path"), "w") as f:
+ f.write(browser_path)
+
+ return browser_path
def beautify_html(escaped_html):
"""
Beautifies an escaped HTML string.
-
+
Parameters:
escaped_html (str): A string containing escaped HTML.
-
+
Returns:
str: A beautifully formatted HTML string.
"""
# Unescape the HTML string
unescaped_html = html.unescape(escaped_html)
-
+
# Use BeautifulSoup to parse and prettify the HTML
- soup = BeautifulSoup(unescaped_html, 'html.parser')
+ soup = BeautifulSoup(unescaped_html, "html.parser")
pretty_html = soup.prettify()
-
+
return pretty_html
+
def split_and_parse_json_objects(json_string):
"""
Splits a JSON string which is a list of objects and tries to parse each object.
-
+
Parameters:
json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.
-
+
Returns:
tuple: A tuple containing two lists:
- First list contains all successfully parsed JSON objects.
- Second list contains the string representations of all segments that couldn't be parsed.
"""
# Trim the leading '[' and trailing ']'
- if json_string.startswith('[') and json_string.endswith(']'):
+ if json_string.startswith("[") and json_string.endswith("]"):
json_string = json_string[1:-1].strip()
-
+
# Split the string into segments that look like individual JSON objects
segments = []
depth = 0
start_index = 0
-
+
for i, char in enumerate(json_string):
- if char == '{':
+ if char == "{":
if depth == 0:
start_index = i
depth += 1
- elif char == '}':
+ elif char == "}":
depth -= 1
if depth == 0:
- segments.append(json_string[start_index:i+1])
-
+ segments.append(json_string[start_index : i + 1])
+
# Try parsing each segment
parsed_objects = []
unparsed_segments = []
-
+
for segment in segments:
try:
obj = json.loads(segment)
parsed_objects.append(obj)
except json.JSONDecodeError:
unparsed_segments.append(segment)
-
+
return parsed_objects, unparsed_segments
+
def sanitize_html(html):
"""
Sanitize an HTML string by escaping quotes.
@@ -263,7 +341,7 @@ def sanitize_html(html):
Returns:
str: The sanitized HTML string.
"""
-
+
# Replace all unwanted and special characters with an empty string
sanitized_html = html
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
@@ -273,21 +351,25 @@ def sanitize_html(html):
return sanitized_html
+
def sanitize_input_encode(text: str) -> str:
"""Sanitize input to handle potential encoding issues."""
try:
try:
if not text:
- return ''
+ return ""
# Attempt to encode and decode as UTF-8 to handle potential encoding issues
- return text.encode('utf-8', errors='ignore').decode('utf-8')
+ return text.encode("utf-8", errors="ignore").decode("utf-8")
except UnicodeEncodeError as e:
- print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
+ print(
+ f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}"
+ )
# Fall back to ASCII if UTF-8 fails
- return text.encode('ascii', errors='ignore').decode('ascii')
+ return text.encode("ascii", errors="ignore").decode("ascii")
except Exception as e:
raise ValueError(f"Error sanitizing input: {str(e)}") from e
+
def escape_json_string(s):
"""
Escapes characters in a string to be JSON safe.
@@ -299,24 +381,25 @@ def escape_json_string(s):
str: The escaped string, safe for JSON encoding.
"""
# Replace problematic backslash first
- s = s.replace('\\', '\\\\')
-
+ s = s.replace("\\", "\\\\")
+
# Replace the double quote
s = s.replace('"', '\\"')
-
+
# Escape control characters
- s = s.replace('\b', '\\b')
- s = s.replace('\f', '\\f')
- s = s.replace('\n', '\\n')
- s = s.replace('\r', '\\r')
- s = s.replace('\t', '\\t')
-
+ s = s.replace("\b", "\\b")
+ s = s.replace("\f", "\\f")
+ s = s.replace("\n", "\\n")
+ s = s.replace("\r", "\\r")
+ s = s.replace("\t", "\\t")
+
# Additional problematic characters
# Unicode control characters
- s = re.sub(r'[\x00-\x1f\x7f-\x9f]', lambda x: '\\u{:04x}'.format(ord(x.group())), s)
-
+ s = re.sub(r"[\x00-\x1f\x7f-\x9f]", lambda x: "\\u{:04x}".format(ord(x.group())), s)
+
return s
+
def replace_inline_tags(soup, tags, only_text=False):
"""
Replace inline HTML tags with Markdown-style equivalents.
@@ -336,37 +419,39 @@ def replace_inline_tags(soup, tags, only_text=False):
"""
tag_replacements = {
- 'b': lambda tag: f"**{tag.text}**",
- 'i': lambda tag: f"*{tag.text}*",
- 'u': lambda tag: f"__{tag.text}__",
- 'span': lambda tag: f"{tag.text}",
- 'del': lambda tag: f"~~{tag.text}~~",
- 'ins': lambda tag: f"++{tag.text}++",
- 'sub': lambda tag: f"~{tag.text}~",
- 'sup': lambda tag: f"^^{tag.text}^^",
- 'strong': lambda tag: f"**{tag.text}**",
- 'em': lambda tag: f"*{tag.text}*",
- 'code': lambda tag: f"`{tag.text}`",
- 'kbd': lambda tag: f"`{tag.text}`",
- 'var': lambda tag: f"_{tag.text}_",
- 's': lambda tag: f"~~{tag.text}~~",
- 'q': lambda tag: f'"{tag.text}"',
- 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
- 'cite': lambda tag: f"_{tag.text}_",
- 'dfn': lambda tag: f"_{tag.text}_",
- 'time': lambda tag: f"{tag.text}",
- 'small': lambda tag: f"{tag.text}",
- 'mark': lambda tag: f"=={tag.text}=="
+ "b": lambda tag: f"**{tag.text}**",
+ "i": lambda tag: f"*{tag.text}*",
+ "u": lambda tag: f"__{tag.text}__",
+ "span": lambda tag: f"{tag.text}",
+ "del": lambda tag: f"~~{tag.text}~~",
+ "ins": lambda tag: f"++{tag.text}++",
+ "sub": lambda tag: f"~{tag.text}~",
+ "sup": lambda tag: f"^^{tag.text}^^",
+ "strong": lambda tag: f"**{tag.text}**",
+ "em": lambda tag: f"*{tag.text}*",
+ "code": lambda tag: f"`{tag.text}`",
+ "kbd": lambda tag: f"`{tag.text}`",
+ "var": lambda tag: f"_{tag.text}_",
+ "s": lambda tag: f"~~{tag.text}~~",
+ "q": lambda tag: f'"{tag.text}"',
+ "abbr": lambda tag: f"{tag.text} ({tag.get('title', '')})",
+ "cite": lambda tag: f"_{tag.text}_",
+ "dfn": lambda tag: f"_{tag.text}_",
+ "time": lambda tag: f"{tag.text}",
+ "small": lambda tag: f"{tag.text}",
+ "mark": lambda tag: f"=={tag.text}==",
}
-
- replacement_data = [(tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags]
+
+ replacement_data = [
+ (tag, tag_replacements.get(tag, lambda t: t.text)) for tag in tags
+ ]
for tag_name, replacement_func in replacement_data:
for tag in soup.find_all(tag_name):
replacement_text = tag.text if only_text else replacement_func(tag)
tag.replace_with(replacement_text)
- return soup
+ return soup
# for tag_name in tags:
# for tag in soup.find_all(tag_name):
@@ -378,7 +463,10 @@ def replace_inline_tags(soup, tags, only_text=False):
# return soup
-def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
+
+def get_content_of_website(
+ url, html, word_count_threshold=MIN_WORD_THRESHOLD, css_selector=None, **kwargs
+):
"""
Extract structured content, media, and links from website HTML.
@@ -403,120 +491,128 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
if not html:
return None
# Parse HTML content with BeautifulSoup
- soup = BeautifulSoup(html, 'html.parser')
+ soup = BeautifulSoup(html, "html.parser")
# Get the content within the tag
body = soup.body
-
+
# If css_selector is provided, extract content based on the selector
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
- raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
- div_tag = soup.new_tag('div')
+ raise InvalidCSSSelectorError(
+ f"Invalid CSS selector , No elements found for CSS selector: {css_selector}"
+ )
+ div_tag = soup.new_tag("div")
for el in selected_elements:
div_tag.append(el)
body = div_tag
-
- links = {
- 'internal': [],
- 'external': []
- }
-
+
+ links = {"internal": [], "external": []}
+
# Extract all internal and external links
- for a in body.find_all('a', href=True):
- href = a['href']
- url_base = url.split('/')[2]
- if href.startswith('http') and url_base not in href:
- links['external'].append({
- 'href': href,
- 'text': a.get_text()
- })
+ for a in body.find_all("a", href=True):
+ href = a["href"]
+ url_base = url.split("/")[2]
+ if href.startswith("http") and url_base not in href:
+ links["external"].append({"href": href, "text": a.get_text()})
else:
- links['internal'].append(
- {
- 'href': href,
- 'text': a.get_text()
- }
- )
+ links["internal"].append({"href": href, "text": a.get_text()})
# Remove script, style, and other tags that don't carry useful content from body
- for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
+ for tag in body.find_all(["script", "style", "link", "meta", "noscript"]):
tag.decompose()
# Remove all attributes from remaining tags in body, except for img tags
for tag in body.find_all():
- if tag.name != 'img':
+ if tag.name != "img":
tag.attrs = {}
# Extract all img tgas int0 [{src: '', alt: ''}]
- media = {
- 'images': [],
- 'videos': [],
- 'audios': []
- }
- for img in body.find_all('img'):
- media['images'].append({
- 'src': img.get('src'),
- 'alt': img.get('alt'),
- "type": "image"
- })
-
+ media = {"images": [], "videos": [], "audios": []}
+ for img in body.find_all("img"):
+ media["images"].append(
+ {"src": img.get("src"), "alt": img.get("alt"), "type": "image"}
+ )
+
# Extract all video tags into [{src: '', alt: ''}]
- for video in body.find_all('video'):
- media['videos'].append({
- 'src': video.get('src'),
- 'alt': video.get('alt'),
- "type": "video"
- })
-
+ for video in body.find_all("video"):
+ media["videos"].append(
+ {"src": video.get("src"), "alt": video.get("alt"), "type": "video"}
+ )
+
# Extract all audio tags into [{src: '', alt: ''}]
- for audio in body.find_all('audio'):
- media['audios'].append({
- 'src': audio.get('src'),
- 'alt': audio.get('alt'),
- "type": "audio"
- })
-
+ for audio in body.find_all("audio"):
+ media["audios"].append(
+ {"src": audio.get("src"), "alt": audio.get("alt"), "type": "audio"}
+ )
+
# Replace images with their alt text or remove them if no alt text is available
- for img in body.find_all('img'):
- alt_text = img.get('alt')
+ for img in body.find_all("img"):
+ alt_text = img.get("alt")
if alt_text:
img.replace_with(soup.new_string(alt_text))
else:
img.decompose()
-
# Create a function that replace content of all"pre" tag with its inner text
def replace_pre_tags_with_text(node):
- for child in node.find_all('pre'):
+ for child in node.find_all("pre"):
# set child inner html to its text
child.string = child.get_text()
return node
-
+
# Replace all "pre" tags with their inner text
body = replace_pre_tags_with_text(body)
-
+
# Replace inline tags with their text content
body = replace_inline_tags(
- body,
- ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'],
- only_text=kwargs.get('only_text', False)
+ body,
+ [
+ "b",
+ "i",
+ "u",
+ "span",
+ "del",
+ "ins",
+ "sub",
+ "sup",
+ "strong",
+ "em",
+ "code",
+ "kbd",
+ "var",
+ "s",
+ "q",
+ "abbr",
+ "cite",
+ "dfn",
+ "time",
+ "small",
+ "mark",
+ ],
+ only_text=kwargs.get("only_text", False),
)
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
for child in node.contents:
if isinstance(child, element.Tag):
- remove_empty_and_low_word_count_elements(child, word_count_threshold)
+ remove_empty_and_low_word_count_elements(
+ child, word_count_threshold
+ )
word_count = len(child.get_text(strip=True).split())
- if (len(child.contents) == 0 and not child.get_text(strip=True)) or word_count < word_count_threshold:
+ if (
+ len(child.contents) == 0 and not child.get_text(strip=True)
+ ) or word_count < word_count_threshold:
child.decompose()
return node
body = remove_empty_and_low_word_count_elements(body, word_count_threshold)
-
- def remove_small_text_tags(body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD):
+
+ def remove_small_text_tags(
+ body: Tag, word_count_threshold: int = MIN_WORD_THRESHOLD
+ ):
# We'll use a list to collect all tags that don't meet the word count requirement
tags_to_remove = []
@@ -535,11 +631,10 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
tag.decompose() # or tag.extract() to remove and get the element
return body
-
-
+
# Remove small text tags
- body = remove_small_text_tags(body, word_count_threshold)
-
+ body = remove_small_text_tags(body, word_count_threshold)
+
def is_empty_or_whitespace(tag: Tag):
if isinstance(tag, NavigableString):
return not tag.strip()
@@ -554,41 +649,43 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
while changes:
changes = False
# Collect all tags that are empty or contain only whitespace
- empty_tags = [tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)]
+ empty_tags = [
+ tag for tag in body.find_all(True) if is_empty_or_whitespace(tag)
+ ]
for tag in empty_tags:
# If a tag is empty, decompose it
tag.decompose()
changes = True # Mark that a change was made
- return body
+ return body
-
# Remove empty tags
body = remove_empty_tags(body)
-
+
# Flatten nested elements with only one child of the same type
def flatten_nested_elements(node):
for child in node.contents:
if isinstance(child, element.Tag):
flatten_nested_elements(child)
- if len(child.contents) == 1 and child.contents[0].name == child.name:
+ if (
+ len(child.contents) == 1
+ and child.contents[0].name == child.name
+ ):
# print('Flattening:', child.name)
child_content = child.contents[0]
child.replace_with(child_content)
-
+
return node
body = flatten_nested_elements(body)
-
-
# Remove comments
- for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+ for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
# Remove consecutive empty newlines and replace multiple spaces with a single space
- cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
-
+ cleaned_html = str(body).replace("\n\n", "\n").replace(" ", " ")
+
# Sanitize the cleaned HTML content
cleaned_html = sanitize_html(cleaned_html)
# sanitized_html = escape_json_string(cleaned_html)
@@ -598,81 +695,97 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
h = CustomHTML2Text()
h.ignore_links = True
markdown = h.handle(cleaned_html)
- markdown = markdown.replace(' ```', '```')
-
+ markdown = markdown.replace(" ```", "```")
+
try:
meta = extract_metadata(html, soup)
except Exception as e:
- print('Error extracting metadata:', str(e))
+ print("Error extracting metadata:", str(e))
meta = {}
-
-
+
# Return the Markdown content
- return{
- 'markdown': markdown,
- 'cleaned_html': cleaned_html,
- 'success': True,
- 'media': media,
- 'links': links,
- 'metadata': meta
+ return {
+ "markdown": markdown,
+ "cleaned_html": cleaned_html,
+ "success": True,
+ "media": media,
+ "links": links,
+ "metadata": meta,
}
except Exception as e:
- print('Error processing HTML content:', str(e))
+ print("Error processing HTML content:", str(e))
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
-def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+
+def get_content_of_website_optimized(
+ url: str,
+ html: str,
+ word_count_threshold: int = MIN_WORD_THRESHOLD,
+ css_selector: str = None,
+ **kwargs,
+) -> Dict[str, Any]:
if not html:
return None
- soup = BeautifulSoup(html, 'html.parser')
+ soup = BeautifulSoup(html, "html.parser")
body = soup.body
-
- image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
- for tag in kwargs.get('excluded_tags', []) or []:
+ image_description_min_word_threshold = kwargs.get(
+ "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
+ )
+
+ for tag in kwargs.get("excluded_tags", []) or []:
for el in body.select(tag):
el.decompose()
-
+
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
- raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
- body = soup.new_tag('div')
+ raise InvalidCSSSelectorError(
+ f"Invalid CSS selector, No elements found for CSS selector: {css_selector}"
+ )
+ body = soup.new_tag("div")
for el in selected_elements:
body.append(el)
- links = {'internal': [], 'external': []}
- media = {'images': [], 'videos': [], 'audios': []}
+ links = {"internal": [], "external": []}
+ media = {"images": [], "videos": [], "audios": []}
# Extract meaningful text for media files from closest parent
def find_closest_parent_with_useful_text(tag):
- current_tag = tag
- while current_tag:
- current_tag = current_tag.parent
- # Get the text content from the parent tag
- if current_tag:
- text_content = current_tag.get_text(separator=' ',strip=True)
- # Check if the text content has at least word_count_threshold
- if len(text_content.split()) >= image_description_min_word_threshold:
- return text_content
- return None
+ current_tag = tag
+ while current_tag:
+ current_tag = current_tag.parent
+ # Get the text content from the parent tag
+ if current_tag:
+ text_content = current_tag.get_text(separator=" ", strip=True)
+ # Check if the text content has at least word_count_threshold
+ if len(text_content.split()) >= image_description_min_word_threshold:
+ return text_content
+ return None
def process_image(img, url, index, total_images):
- #Check if an image has valid display and inside undesired html elements
+        # Check if an image has a valid display and is not inside undesired HTML elements
def is_valid_image(img, parent, parent_classes):
- style = img.get('style', '')
- src = img.get('src', '')
- classes_to_check = ['button', 'icon', 'logo']
- tags_to_check = ['button', 'input']
- return all([
- 'display:none' not in style,
- src,
- not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
- parent.name not in tags_to_check
- ])
+ style = img.get("style", "")
+ src = img.get("src", "")
+ classes_to_check = ["button", "icon", "logo"]
+ tags_to_check = ["button", "input"]
+ return all(
+ [
+ "display:none" not in style,
+ src,
+ not any(
+ s in var
+ for var in [src, img.get("alt", ""), *parent_classes]
+ for s in classes_to_check
+ ),
+ parent.name not in tags_to_check,
+ ]
+ )
- #Score an image for it's usefulness
+        # Score an image for its usefulness
def score_image_for_usefulness(img, base_url, index, images_count):
# Function to parse image height/width value and units
def parse_dimension(dimension):
@@ -680,66 +793,68 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
match = re.match(r"(\d+)(\D*)", dimension)
if match:
number = int(match.group(1))
- unit = match.group(2) or 'px' # Default unit is 'px' if not specified
+ unit = (
+ match.group(2) or "px"
+ ) # Default unit is 'px' if not specified
return number, unit
return None, None
# Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
- #If src is relative path construct full URL, if not it may be CDN URL
- img_url = urljoin(base_url,img.get('src'))
+ # If src is a relative path, construct the full URL; otherwise it may be a CDN URL
+ img_url = urljoin(base_url, img.get("src"))
try:
response = requests.head(img_url)
if response.status_code == 200:
- return response.headers.get('Content-Length',None)
+ return response.headers.get("Content-Length", None)
else:
print(f"Failed to retrieve file size for {img_url}")
return None
- except InvalidSchema as e:
+ except InvalidSchema:
return None
finally:
return
- image_height = img.get('height')
+ image_height = img.get("height")
height_value, height_unit = parse_dimension(image_height)
- image_width = img.get('width')
+ image_width = img.get("width")
width_value, width_unit = parse_dimension(image_width)
- image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
- image_format = os.path.splitext(img.get('src',''))[1].lower()
+ image_size = 0 # int(fetch_image_file_size(img,base_url) or 0)
+ image_format = os.path.splitext(img.get("src", ""))[1].lower()
# Remove . from format
- image_format = image_format.strip('.')
+ image_format = image_format.strip(".")
score = 0
if height_value:
- if height_unit == 'px' and height_value > 150:
+ if height_unit == "px" and height_value > 150:
score += 1
- if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
+ if height_unit in ["%", "vh", "vmin", "vmax"] and height_value > 30:
score += 1
if width_value:
- if width_unit == 'px' and width_value > 150:
+ if width_unit == "px" and width_value > 150:
score += 1
- if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
+ if width_unit in ["%", "vh", "vmin", "vmax"] and width_value > 30:
score += 1
if image_size > 10000:
score += 1
- if img.get('alt') != '':
- score+=1
- if any(image_format==format for format in ['jpg','png','webp']):
- score+=1
- if index/images_count<0.5:
- score+=1
+ if img.get("alt") != "":
+ score += 1
+ if any(image_format == format for format in ["jpg", "png", "webp"]):
+ score += 1
+ if index / images_count < 0.5:
+ score += 1
return score
- if not is_valid_image(img, img.parent, img.parent.get('class', [])):
+ if not is_valid_image(img, img.parent, img.parent.get("class", [])):
return None
score = score_image_for_usefulness(img, url, index, total_images)
if score <= IMAGE_SCORE_THRESHOLD:
return None
return {
- 'src': img.get('src', '').replace('\\"', '"').strip(),
- 'alt': img.get('alt', ''),
- 'desc': find_closest_parent_with_useful_text(img),
- 'score': score,
- 'type': 'image'
+ "src": img.get("src", "").replace('\\"', '"').strip(),
+ "alt": img.get("alt", ""),
+ "desc": find_closest_parent_with_useful_text(img),
+ "score": score,
+ "type": "image",
}
def process_element(element: element.PageElement) -> bool:
@@ -749,60 +864,89 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
element.extract()
return False
- if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
+ if element.name in ["script", "style", "link", "meta", "noscript"]:
element.decompose()
return False
keep_element = False
- if element.name == 'a' and element.get('href'):
- href = element['href']
- url_base = url.split('/')[2]
- link_data = {'href': href, 'text': element.get_text()}
- if href.startswith('http') and url_base not in href:
- links['external'].append(link_data)
+ if element.name == "a" and element.get("href"):
+ href = element["href"]
+ url_base = url.split("/")[2]
+ link_data = {"href": href, "text": element.get_text()}
+ if href.startswith("http") and url_base not in href:
+ links["external"].append(link_data)
else:
- links['internal'].append(link_data)
+ links["internal"].append(link_data)
keep_element = True
- elif element.name == 'img':
+ elif element.name == "img":
return True # Always keep image elements
- elif element.name in ['video', 'audio']:
- media[f"{element.name}s"].append({
- 'src': element.get('src'),
- 'alt': element.get('alt'),
- 'type': element.name,
- 'description': find_closest_parent_with_useful_text(element)
- })
- source_tags = element.find_all('source')
+ elif element.name in ["video", "audio"]:
+ media[f"{element.name}s"].append(
+ {
+ "src": element.get("src"),
+ "alt": element.get("alt"),
+ "type": element.name,
+ "description": find_closest_parent_with_useful_text(element),
+ }
+ )
+ source_tags = element.find_all("source")
for source_tag in source_tags:
- media[f"{element.name}s"].append({
- 'src': source_tag.get('src'),
- 'alt': element.get('alt'),
- 'type': element.name,
- 'description': find_closest_parent_with_useful_text(element)
- })
+ media[f"{element.name}s"].append(
+ {
+ "src": source_tag.get("src"),
+ "alt": element.get("alt"),
+ "type": element.name,
+ "description": find_closest_parent_with_useful_text(
+ element
+ ),
+ }
+ )
return True # Always keep video and audio elements
- if element.name != 'pre':
- if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
- if kwargs.get('only_text', False):
+ if element.name != "pre":
+ if element.name in [
+ "b",
+ "i",
+ "u",
+ "span",
+ "del",
+ "ins",
+ "sub",
+ "sup",
+ "strong",
+ "em",
+ "code",
+ "kbd",
+ "var",
+ "s",
+ "q",
+ "abbr",
+ "cite",
+ "dfn",
+ "time",
+ "small",
+ "mark",
+ ]:
+ if kwargs.get("only_text", False):
element.replace_with(element.get_text())
else:
element.unwrap()
- elif element.name != 'img':
+ elif element.name != "img":
element.attrs = {}
# Process children
for child in list(element.children):
- if isinstance(child, NavigableString) and not isinstance(child, Comment):
+ if isinstance(child, NavigableString) and not isinstance(
+ child, Comment
+ ):
if len(child.strip()) > 0:
keep_element = True
else:
if process_element(child):
keep_element = True
-
# Check word count
if not keep_element:
@@ -814,14 +958,16 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
return keep_element
except Exception as e:
- print('Error processing element:', str(e))
+ print("Error processing element:", str(e))
return False
- #process images by filtering and extracting contextual text from the page
- imgs = body.find_all('img')
- media['images'] = [
- result for result in
- (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
+ # process images by filtering and extracting contextual text from the page
+ imgs = body.find_all("img")
+ media["images"] = [
+ result
+ for result in (
+ process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)
+ )
if result is not None
]
@@ -830,7 +976,11 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
def flatten_nested_elements(node):
if isinstance(node, NavigableString):
return node
- if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name:
+ if (
+ len(node.contents) == 1
+ and isinstance(node.contents[0], element.Tag)
+ and node.contents[0].name == node.name
+ ):
return flatten_nested_elements(node.contents[0])
node.contents = [flatten_nested_elements(child) for child in node.contents]
return node
@@ -839,35 +989,93 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
for img in imgs:
try:
- src = img.get('src', '')
+ src = img.get("src", "")
if base64_pattern.match(src):
- img['src'] = base64_pattern.sub('', src)
+ img["src"] = base64_pattern.sub("", src)
except:
- pass
+ pass
- cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
+ cleaned_html = str(body).replace("\n\n", "\n").replace(" ", " ")
cleaned_html = sanitize_html(cleaned_html)
h = CustomHTML2Text()
h.ignore_links = True
markdown = h.handle(cleaned_html)
- markdown = markdown.replace(' ```', '```')
+ markdown = markdown.replace(" ```", "```")
try:
meta = extract_metadata(html, soup)
except Exception as e:
- print('Error extracting metadata:', str(e))
+ print("Error extracting metadata:", str(e))
meta = {}
return {
- 'markdown': markdown,
- 'cleaned_html': cleaned_html,
- 'success': True,
- 'media': media,
- 'links': links,
- 'metadata': meta
+ "markdown": markdown,
+ "cleaned_html": cleaned_html,
+ "success": True,
+ "media": media,
+ "links": links,
+ "metadata": meta,
}
+
+def extract_metadata_using_lxml(html, doc=None):
+ """
+ Extract metadata from HTML using lxml for better performance.
+ """
+ metadata = {}
+
+ if not html and doc is None:
+ return {}
+
+ if doc is None:
+ try:
+ doc = lhtml.document_fromstring(html)
+ except Exception:
+ return {}
+
+ # Use XPath to find head element
+ head = doc.xpath("//head")
+ if not head:
+ return metadata
+
+ head = head[0]
+
+ # Title - using XPath
+ title = head.xpath(".//title/text()")
+ metadata["title"] = title[0].strip() if title else None
+
+ # Meta description - using XPath with multiple attribute conditions
+ description = head.xpath('.//meta[@name="description"]/@content')
+ metadata["description"] = description[0].strip() if description else None
+
+ # Meta keywords
+ keywords = head.xpath('.//meta[@name="keywords"]/@content')
+ metadata["keywords"] = keywords[0].strip() if keywords else None
+
+ # Meta author
+ author = head.xpath('.//meta[@name="author"]/@content')
+ metadata["author"] = author[0].strip() if author else None
+
+ # Open Graph metadata - using starts-with() for performance
+ og_tags = head.xpath('.//meta[starts-with(@property, "og:")]')
+ for tag in og_tags:
+ property_name = tag.get("property", "").strip()
+ content = tag.get("content", "").strip()
+ if property_name and content:
+ metadata[property_name] = content
+
+ # Twitter Card metadata
+ twitter_tags = head.xpath('.//meta[starts-with(@name, "twitter:")]')
+ for tag in twitter_tags:
+ property_name = tag.get("name", "").strip()
+ content = tag.get("content", "").strip()
+ if property_name and content:
+ metadata[property_name] = content
+
+ return metadata
+
+
def extract_metadata(html, soup=None):
"""
Extract optimized content, media, and links from website HTML.
@@ -889,66 +1097,74 @@ def extract_metadata(html, soup=None):
Returns:
Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
"""
-
+
metadata = {}
-
+
if not html and not soup:
return {}
-
+
if not soup:
- soup = BeautifulSoup(html, 'lxml')
-
+ soup = BeautifulSoup(html, "lxml")
+
head = soup.head
if not head:
return metadata
-
+
# Title
- title_tag = head.find('title')
- metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None
+ title_tag = head.find("title")
+ metadata["title"] = (
+ title_tag.string.strip() if title_tag and title_tag.string else None
+ )
# Meta description
- description_tag = head.find('meta', attrs={'name': 'description'})
- metadata['description'] = description_tag.get('content', '').strip() if description_tag else None
+ description_tag = head.find("meta", attrs={"name": "description"})
+ metadata["description"] = (
+ description_tag.get("content", "").strip() if description_tag else None
+ )
# Meta keywords
- keywords_tag = head.find('meta', attrs={'name': 'keywords'})
- metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None
+ keywords_tag = head.find("meta", attrs={"name": "keywords"})
+ metadata["keywords"] = (
+ keywords_tag.get("content", "").strip() if keywords_tag else None
+ )
# Meta author
- author_tag = head.find('meta', attrs={'name': 'author'})
- metadata['author'] = author_tag.get('content', '').strip() if author_tag else None
+ author_tag = head.find("meta", attrs={"name": "author"})
+ metadata["author"] = author_tag.get("content", "").strip() if author_tag else None
# Open Graph metadata
- og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
+ og_tags = head.find_all("meta", attrs={"property": re.compile(r"^og:")})
for tag in og_tags:
- property_name = tag.get('property', '').strip()
- content = tag.get('content', '').strip()
+ property_name = tag.get("property", "").strip()
+ content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
# Twitter Card metadata
- twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
+ twitter_tags = head.find_all("meta", attrs={"name": re.compile(r"^twitter:")})
for tag in twitter_tags:
- property_name = tag.get('name', '').strip()
- content = tag.get('content', '').strip()
+ property_name = tag.get("name", "").strip()
+ content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
-
+
return metadata
+
def extract_xml_tags(string):
"""
Extracts XML tags from a string.
- Args:
+ Args:
string (str): The input string containing XML tags.
Returns:
List[str]: A list of XML tags extracted from the input string.
"""
- tags = re.findall(r'<(\w+)>', string)
+ tags = re.findall(r"<(\w+)>", string)
return list(set(tags))
+
def extract_xml_data(tags, string):
"""
Extract data for specified XML tags from a string.
@@ -977,15 +1193,16 @@ def extract_xml_data(tags, string):
data[tag] = ""
return data
-
+
+
def perform_completion_with_backoff(
- provider,
- prompt_with_variables,
- api_token,
- json_response = False,
+ provider,
+ prompt_with_variables,
+ api_token,
+ json_response=False,
base_url=None,
- **kwargs
- ):
+ **kwargs,
+):
"""
Perform an API completion request with exponential backoff.
@@ -1005,52 +1222,49 @@ def perform_completion_with_backoff(
Returns:
dict: The API response or an error message after all retries.
"""
-
- from litellm import completion
+
+ from litellm import completion
from litellm.exceptions import RateLimitError
+
max_attempts = 3
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
-
- extra_args = {
- "temperature": 0.01,
- 'api_key': api_token,
- 'base_url': base_url
- }
+
+ extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
if json_response:
- extra_args["response_format"] = { "type": "json_object" }
-
+ extra_args["response_format"] = {"type": "json_object"}
+
if kwargs.get("extra_args"):
extra_args.update(kwargs["extra_args"])
-
+
for attempt in range(max_attempts):
try:
-
- response =completion(
+ response = completion(
model=provider,
- messages=[
- {"role": "user", "content": prompt_with_variables}
- ],
- **extra_args
+ messages=[{"role": "user", "content": prompt_with_variables}],
+ **extra_args,
)
return response # Return the successful response
except RateLimitError as e:
print("Rate limit error:", str(e))
-
+
# Check if we have exhausted our max attempts
if attempt < max_attempts - 1:
# Calculate the delay and wait
- delay = base_delay * (2 ** attempt) # Exponential backoff formula
+ delay = base_delay * (2**attempt) # Exponential backoff formula
print(f"Waiting for {delay} seconds before retrying...")
time.sleep(delay)
else:
# Return an error response after exhausting all retries
- return [{
- "index": 0,
- "tags": ["error"],
- "content": ["Rate limit error. Please try again later."]
- }]
-
-def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
+ return [
+ {
+ "index": 0,
+ "tags": ["error"],
+ "content": ["Rate limit error. Please try again later."],
+ }
+ ]
+
+
+def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None, base_url=None):
"""
Extract content blocks from website HTML using an AI provider.
@@ -1072,7 +1286,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
-
+
variable_values = {
"URL": url,
"HTML": escape_json_string(sanitize_html(html)),
@@ -1083,29 +1297,33 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
prompt_with_variables = prompt_with_variables.replace(
"{" + variable + "}", variable_values[variable]
)
-
- response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
-
+
+ response = perform_completion_with_backoff(
+ provider, prompt_with_variables, api_token, base_url=base_url
+ )
+
try:
- blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[
+ "blocks"
+ ]
blocks = json.loads(blocks)
## Add error: False to the blocks
for block in blocks:
- block['error'] = False
- except Exception as e:
- parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
+ block["error"] = False
+ except Exception:
+ parsed, unparsed = split_and_parse_json_objects(
+ response.choices[0].message.content
+ )
blocks = parsed
# Append all unparsed segments as onr error block and content is list of unparsed segments
if unparsed:
- blocks.append({
- "index": 0,
- "error": True,
- "tags": ["error"],
- "content": unparsed
- })
+ blocks.append(
+ {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+ )
return blocks
-def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
+
+def extract_blocks_batch(batch_data, provider="groq/llama3-70b-8192", api_token=None):
"""
Extract content blocks from a batch of website HTMLs.
@@ -1123,11 +1341,12 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
List[dict]: A list of extracted content blocks from all batch items.
"""
- api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
+ api_token = os.getenv("GROQ_API_KEY", None) if not api_token else api_token
from litellm import batch_completion
+
messages = []
-
- for url, html in batch_data:
+
+ for url, html in batch_data:
variable_values = {
"URL": url,
"HTML": html,
@@ -1138,33 +1357,37 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
prompt_with_variables = prompt_with_variables.replace(
"{" + variable + "}", variable_values[variable]
)
-
+
messages.append([{"role": "user", "content": prompt_with_variables}])
-
-
- responses = batch_completion(
- model = provider,
- messages = messages,
- temperature = 0.01
- )
-
+
+ responses = batch_completion(model=provider, messages=messages, temperature=0.01)
+
all_blocks = []
- for response in responses:
+ for response in responses:
try:
- blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
+ blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[
+ "blocks"
+ ]
blocks = json.loads(blocks)
- except Exception as e:
- blocks = [{
- "index": 0,
- "tags": ["error"],
- "content": ["Error extracting blocks from the HTML content. Choose another provider/model or try again."],
- "questions": ["What went wrong during the block extraction process?"]
- }]
+ except Exception:
+ blocks = [
+ {
+ "index": 0,
+ "tags": ["error"],
+ "content": [
+ "Error extracting blocks from the HTML content. Choose another provider/model or try again."
+ ],
+ "questions": [
+ "What went wrong during the block extraction process?"
+ ],
+ }
+ ]
all_blocks.append(blocks)
-
+
return sum(all_blocks, [])
+
def merge_chunks_based_on_token_threshold(chunks, token_threshold):
"""
Merges small chunks into larger ones based on the total token threshold.
@@ -1178,23 +1401,28 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
total_token_so_far = 0
for chunk in chunks:
- chunk_token_count = len(chunk.split()) * 1.3 # Estimate token count with a factor
+ chunk_token_count = (
+ len(chunk.split()) * 1.3
+ ) # Estimate token count with a factor
if total_token_so_far + chunk_token_count < token_threshold:
current_chunk.append(chunk)
total_token_so_far += chunk_token_count
else:
if current_chunk:
- merged_sections.append('\n\n'.join(current_chunk))
+ merged_sections.append("\n\n".join(current_chunk))
current_chunk = [chunk]
total_token_so_far = chunk_token_count
# Add the last chunk if it exists
if current_chunk:
- merged_sections.append('\n\n'.join(current_chunk))
+ merged_sections.append("\n\n".join(current_chunk))
return merged_sections
-def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
+
+def process_sections(
+ url: str, sections: list, provider: str, api_token: str, base_url=None
+) -> list:
"""
Process sections of HTML content sequentially or in parallel.
@@ -1218,17 +1446,25 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba
if provider.startswith("groq/"):
# Sequential processing with a delay
for section in sections:
- extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
+ extracted_content.extend(
+ extract_blocks(url, section, provider, api_token, base_url=base_url)
+ )
time.sleep(0.5) # 500 ms delay between each processing
else:
# Parallel processing using ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
- futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
+ futures = [
+ executor.submit(
+ extract_blocks, url, section, provider, api_token, base_url=base_url
+ )
+ for section in sections
+ ]
for future in as_completed(futures):
extracted_content.extend(future.result())
-
+
return extracted_content
+
def wrap_text(draw, text, font, max_width):
"""
Wrap text to fit within a specified width for rendering.
@@ -1252,11 +1488,14 @@ def wrap_text(draw, text, font, max_width):
lines = []
words = text.split()
while words:
- line = ''
- while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width:
- line += (words.pop(0) + ' ')
+ line = ""
+ while (
+ words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width
+ ):
+ line += words.pop(0) + " "
lines.append(line)
- return '\n'.join(lines)
+ return "\n".join(lines)
+
def format_html(html_string):
"""
@@ -1274,16 +1513,17 @@ def format_html(html_string):
str: The prettified HTML string.
"""
- soup = BeautifulSoup(html_string, 'lxml.parser')
+ soup = BeautifulSoup(html_string, "lxml.parser")
return soup.prettify()
+
def fast_format_html(html_string):
"""
A fast HTML formatter that uses string operations instead of parsing.
-
+
Args:
html_string (str): The HTML string to format
-
+
Returns:
str: The formatted HTML string
"""
@@ -1292,35 +1532,36 @@ def fast_format_html(html_string):
indent_str = " " # Two spaces for indentation
formatted = []
in_content = False
-
+
# Split by < and > to separate tags and content
- parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n')
-
+ parts = html_string.replace(">", ">\n").replace("<", "\n<").split("\n")
+
for part in parts:
if not part.strip():
continue
-
+
# Handle closing tags
- if part.startswith(''):
+ if part.startswith(""):
indent -= 1
formatted.append(indent_str * indent + part)
-
+
# Handle self-closing tags
- elif part.startswith('<') and part.endswith('/>'):
+ elif part.startswith("<") and part.endswith("/>"):
formatted.append(indent_str * indent + part)
-
+
# Handle opening tags
- elif part.startswith('<'):
+ elif part.startswith("<"):
formatted.append(indent_str * indent + part)
indent += 1
-
+
# Handle content between tags
else:
content = part.strip()
if content:
formatted.append(indent_str * indent + content)
-
- return '\n'.join(formatted)
+
+ return "\n".join(formatted)
+
def normalize_url(href, base_url):
"""Normalize URLs to ensure consistent format"""
@@ -1335,41 +1576,43 @@ def normalize_url(href, base_url):
normalized = urljoin(base_url, href.strip())
return normalized
+
def normalize_url_tmp(href, base_url):
"""Normalize URLs to ensure consistent format"""
# Extract protocol and domain from base URL
try:
- base_parts = base_url.split('/')
+ base_parts = base_url.split("/")
protocol = base_parts[0]
domain = base_parts[2]
except IndexError:
raise ValueError(f"Invalid base URL format: {base_url}")
-
+
# Handle special protocols
- special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+ special_protocols = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
if any(href.lower().startswith(proto) for proto in special_protocols):
return href.strip()
-
+
# Handle anchor links
- if href.startswith('#'):
+ if href.startswith("#"):
return f"{base_url}{href}"
-
+
# Handle protocol-relative URLs
- if href.startswith('//'):
+ if href.startswith("//"):
return f"{protocol}{href}"
-
+
# Handle root-relative URLs
- if href.startswith('/'):
+ if href.startswith("/"):
return f"{protocol}//{domain}{href}"
-
+
# Handle relative URLs
- if not href.startswith(('http://', 'https://')):
+ if not href.startswith(("http://", "https://")):
# Remove leading './' if present
- href = href.lstrip('./')
+ href = href.lstrip("./")
return f"{protocol}//{domain}/{href}"
-
+
return href.strip()
+
def get_base_domain(url: str) -> str:
"""
Extract the base domain from a given URL, handling common edge cases.
@@ -1390,25 +1633,37 @@ def get_base_domain(url: str) -> str:
domain = urlparse(url).netloc.lower()
if not domain:
return ""
-
+
# Remove port if present
- domain = domain.split(':')[0]
-
+ domain = domain.split(":")[0]
+
# Remove www
- domain = re.sub(r'^www\.', '', domain)
-
+ domain = re.sub(r"^www\.", "", domain)
+
# Extract last two parts of domain (handles co.uk etc)
- parts = domain.split('.')
+ parts = domain.split(".")
if len(parts) > 2 and parts[-2] in {
- 'co', 'com', 'org', 'gov', 'edu', 'net',
- 'mil', 'int', 'ac', 'ad', 'ae', 'af', 'ag'
+ "co",
+ "com",
+ "org",
+ "gov",
+ "edu",
+ "net",
+ "mil",
+ "int",
+ "ac",
+ "ad",
+ "ae",
+ "af",
+ "ag",
}:
- return '.'.join(parts[-3:])
-
- return '.'.join(parts[-2:])
+ return ".".join(parts[-3:])
+
+ return ".".join(parts[-2:])
except Exception:
return ""
+
def is_external_url(url: str, base_domain: str) -> bool:
"""
Extract the base domain from a given URL, handling common edge cases.
@@ -1424,24 +1679,25 @@ def is_external_url(url: str, base_domain: str) -> bool:
Returns:
str: The extracted base domain or an empty string if parsing fails.
"""
- special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
+ special = {"mailto:", "tel:", "ftp:", "file:", "data:", "javascript:"}
if any(url.lower().startswith(p) for p in special):
return True
-
+
try:
parsed = urlparse(url)
if not parsed.netloc: # Relative URL
return False
-
+
# Strip 'www.' from both domains for comparison
- url_domain = parsed.netloc.lower().replace('www.', '')
- base = base_domain.lower().replace('www.', '')
-
+ url_domain = parsed.netloc.lower().replace("www.", "")
+ base = base_domain.lower().replace("www.", "")
+
# Check if URL domain ends with base domain
return not url_domain.endswith(base)
except Exception:
return False
+
def clean_tokens(tokens: list[str]) -> list[str]:
"""
Clean a list of tokens by removing noise, stop words, and short tokens.
@@ -1459,58 +1715,217 @@ def clean_tokens(tokens: list[str]) -> list[str]:
"""
# Set of tokens to remove
- noise = {'ccp', 'up', 'β', 'β²', 'β¬οΈ', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'}
+ noise = {
+ "ccp",
+ "up",
+ "β",
+ "β²",
+ "β¬οΈ",
+ "a",
+ "an",
+ "at",
+ "by",
+ "in",
+ "of",
+ "on",
+ "to",
+ "the",
+ }
STOP_WORDS = {
- 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
- 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
- 'to', 'was', 'were', 'will', 'with',
-
+ "a",
+ "an",
+ "and",
+ "are",
+ "as",
+ "at",
+ "be",
+ "by",
+ "for",
+ "from",
+ "has",
+ "he",
+ "in",
+ "is",
+ "it",
+ "its",
+ "of",
+ "on",
+ "that",
+ "the",
+ "to",
+ "was",
+ "were",
+ "will",
+ "with",
# Pronouns
- 'i', 'you', 'he', 'she', 'it', 'we', 'they',
- 'me', 'him', 'her', 'us', 'them',
- 'my', 'your', 'his', 'her', 'its', 'our', 'their',
- 'mine', 'yours', 'hers', 'ours', 'theirs',
- 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves',
-
+ "i",
+ "you",
+ "he",
+ "she",
+ "it",
+ "we",
+ "they",
+ "me",
+ "him",
+ "her",
+ "us",
+ "them",
+ "my",
+ "your",
+ "his",
+ "her",
+ "its",
+ "our",
+ "their",
+ "mine",
+ "yours",
+ "hers",
+ "ours",
+ "theirs",
+ "myself",
+ "yourself",
+ "himself",
+ "herself",
+ "itself",
+ "ourselves",
+ "themselves",
# Common verbs
- 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
- 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
-
+ "am",
+ "is",
+ "are",
+ "was",
+ "were",
+ "be",
+ "been",
+ "being",
+ "have",
+ "has",
+ "had",
+ "having",
+ "do",
+ "does",
+ "did",
+ "doing",
# Prepositions
- 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around',
- 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond',
- 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into',
- 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through',
- 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within',
-
+ "about",
+ "above",
+ "across",
+ "after",
+ "against",
+ "along",
+ "among",
+ "around",
+ "at",
+ "before",
+ "behind",
+ "below",
+ "beneath",
+ "beside",
+ "between",
+ "beyond",
+ "by",
+ "down",
+ "during",
+ "except",
+ "for",
+ "from",
+ "in",
+ "inside",
+ "into",
+ "near",
+ "of",
+ "off",
+ "on",
+ "out",
+ "outside",
+ "over",
+ "past",
+ "through",
+ "to",
+ "toward",
+ "under",
+ "underneath",
+ "until",
+ "up",
+ "upon",
+ "with",
+ "within",
# Conjunctions
- 'and', 'but', 'or', 'nor', 'for', 'yet', 'so',
- 'although', 'because', 'since', 'unless',
-
+ "and",
+ "but",
+ "or",
+ "nor",
+ "for",
+ "yet",
+ "so",
+ "although",
+ "because",
+ "since",
+ "unless",
# Articles
- 'a', 'an', 'the',
-
+ "a",
+ "an",
+ "the",
# Other common words
- 'this', 'that', 'these', 'those',
- 'what', 'which', 'who', 'whom', 'whose',
- 'when', 'where', 'why', 'how',
- 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
- 'can', 'cannot', "can't", 'could', "couldn't",
- 'may', 'might', 'must', "mustn't",
- 'shall', 'should', "shouldn't",
- 'will', "won't", 'would', "wouldn't",
- 'not', "n't", 'no', 'nor', 'none'
- }
-
+ "this",
+ "that",
+ "these",
+ "those",
+ "what",
+ "which",
+ "who",
+ "whom",
+ "whose",
+ "when",
+ "where",
+ "why",
+ "how",
+ "all",
+ "any",
+ "both",
+ "each",
+ "few",
+ "more",
+ "most",
+ "other",
+ "some",
+ "such",
+ "can",
+ "cannot",
+ "can't",
+ "could",
+ "couldn't",
+ "may",
+ "might",
+ "must",
+ "mustn't",
+ "shall",
+ "should",
+ "shouldn't",
+ "will",
+ "won't",
+ "would",
+ "wouldn't",
+ "not",
+ "n't",
+ "no",
+ "nor",
+ "none",
+ }
+
# Single comprehension, more efficient than multiple passes
- return [token for token in tokens
- if len(token) > 2
- and token not in noise
- and token not in STOP_WORDS
- and not token.startswith('β')
- and not token.startswith('β²')
- and not token.startswith('β¬')]
+ return [
+ token
+ for token in tokens
+ if len(token) > 2
+ and token not in noise
+ and token not in STOP_WORDS
+ and not token.startswith("β")
+ and not token.startswith("β²")
+ and not token.startswith("β¬")
+ ]
+
def profile_and_time(func):
"""
@@ -1532,103 +1947,108 @@ def profile_and_time(func):
def wrapper(self, *args, **kwargs):
# Start timer
start_time = time.perf_counter()
-
+
# Setup profiler
profiler = cProfile.Profile()
profiler.enable()
-
+
# Run function
result = func(self, *args, **kwargs)
-
+
# Stop profiler
profiler.disable()
-
+
# Calculate elapsed time
elapsed_time = time.perf_counter() - start_time
-
+
# Print timing
print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")
-
+
# Print profiling stats
stats = pstats.Stats(profiler)
- stats.sort_stats('cumulative') # Sort by cumulative time
+ stats.sort_stats("cumulative") # Sort by cumulative time
stats.print_stats(20) # Print top 20 time-consuming functions
-
+
return result
+
return wrapper
+
def generate_content_hash(content: str) -> str:
"""Generate a unique hash for content"""
return xxhash.xxh64(content.encode()).hexdigest()
# return hashlib.sha256(content.encode()).hexdigest()
+
def ensure_content_dirs(base_path: str) -> Dict[str, str]:
"""Create content directories if they don't exist"""
dirs = {
- 'html': 'html_content',
- 'cleaned': 'cleaned_html',
- 'markdown': 'markdown_content',
- 'extracted': 'extracted_content',
- 'screenshots': 'screenshots',
- 'screenshot': 'screenshots'
+ "html": "html_content",
+ "cleaned": "cleaned_html",
+ "markdown": "markdown_content",
+ "extracted": "extracted_content",
+ "screenshots": "screenshots",
+ "screenshot": "screenshots",
}
-
+
content_paths = {}
for key, dirname in dirs.items():
path = os.path.join(base_path, dirname)
os.makedirs(path, exist_ok=True)
content_paths[key] = path
-
+
return content_paths
+
def configure_windows_event_loop():
"""
Configure the Windows event loop to use ProactorEventLoop.
This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
-
+
This function should only be called on Windows systems and before any async operations.
On non-Windows systems, this function does nothing.
-
+
Example:
```python
from crawl4ai.async_configs import configure_windows_event_loop
-
+
# Call this before any async operations if you're on Windows
configure_windows_event_loop()
```
"""
- if platform.system() == 'Windows':
+ if platform.system() == "Windows":
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
+
def get_error_context(exc_info, context_lines: int = 5):
"""
Extract error context with more reliable line number tracking.
-
+
Args:
exc_info: The exception info from sys.exc_info()
context_lines: Number of lines to show before and after the error
-
+
Returns:
dict: Error context information
"""
import traceback
import linecache
import os
-
+
# Get the full traceback
tb = traceback.extract_tb(exc_info[2])
-
+
# Get the last frame (where the error occurred)
last_frame = tb[-1]
filename = last_frame.filename
line_no = last_frame.lineno
func_name = last_frame.name
-
+
# Get the source code context using linecache
# This is more reliable than inspect.getsourcelines
context_start = max(1, line_no - context_lines)
context_end = line_no + context_lines + 1
-
+
# Build the context lines with line numbers
context_lines = []
for i in range(context_start, context_end):
@@ -1636,25 +2056,22 @@ def get_error_context(exc_info, context_lines: int = 5):
if line:
# Remove any trailing whitespace/newlines and add the pointer for error line
line = line.rstrip()
- pointer = 'β' if i == line_no else ' '
+ pointer = "β" if i == line_no else " "
context_lines.append(f"{i:4d} {pointer} {line}")
-
+
# Join the lines with newlines
- code_context = '\n'.join(context_lines)
-
+ code_context = "\n".join(context_lines)
+
# Get relative path for cleaner output
try:
rel_path = os.path.relpath(filename)
except ValueError:
# Fallback if relpath fails (can happen on Windows with different drives)
rel_path = filename
-
+
return {
"filename": rel_path,
"line_no": line_no,
"function": func_name,
- "code_context": code_context
+ "code_context": code_context,
}
-
-
-
\ No newline at end of file
diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py
index 8ae2de2e..17d73faa 100644
--- a/crawl4ai/version_manager.py
+++ b/crawl4ai/version_manager.py
@@ -1,14 +1,14 @@
# version_manager.py
-import os
from pathlib import Path
from packaging import version
from . import __version__
+
class VersionManager:
def __init__(self):
self.home_dir = Path.home() / ".crawl4ai"
self.version_file = self.home_dir / "version.txt"
-
+
def get_installed_version(self):
"""Get the version recorded in home directory"""
if not self.version_file.exists():
@@ -17,14 +17,13 @@ class VersionManager:
return version.parse(self.version_file.read_text().strip())
except:
return None
-
+
def update_version(self):
"""Update the version file to current library version"""
self.version_file.write_text(__version__.__version__)
-
+
def needs_update(self):
"""Check if database needs update based on version"""
installed = self.get_installed_version()
current = version.parse(__version__.__version__)
return installed is None or installed < current
-
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index a32a988d..a92ae6dd 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -1,9 +1,10 @@
import os, time
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from pathlib import Path
from .models import UrlModel, CrawlResult
-from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
+from .database import init_db, get_cached_url, cache_url
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
@@ -14,31 +15,44 @@ from .content_scraping_strategy import WebScrapingStrategy
from .config import *
import warnings
import json
-warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".')
+
+warnings.filterwarnings(
+ "ignore",
+ message='Field "model_name" has conflict with protected namespace "model_".',
+)
class WebCrawler:
- def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
- self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
+ def __init__(
+ self,
+ crawler_strategy: CrawlerStrategy = None,
+ always_by_pass_cache: bool = False,
+ verbose: bool = False,
+ ):
+ self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(
+ verbose=verbose
+ )
self.always_by_pass_cache = always_by_pass_cache
- self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+ self.crawl4ai_folder = os.path.join(
+ os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
+ )
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
init_db()
self.ready = False
-
+
def warmup(self):
print("[LOG] π€οΈ Warming up the WebCrawler")
self.run(
- url='https://google.com/',
+ url="https://google.com/",
word_count_threshold=5,
extraction_strategy=NoExtractionStrategy(),
bypass_cache=False,
- verbose=False
+ verbose=False,
)
self.ready = True
print("[LOG] π WebCrawler is ready to crawl")
-
+
def fetch_page(
self,
url_model: UrlModel,
@@ -80,6 +94,7 @@ class WebCrawler:
**kwargs,
) -> List[CrawlResult]:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
+
def fetch_page_wrapper(url_model, *args, **kwargs):
return self.fetch_page(url_model, *args, **kwargs)
@@ -104,150 +119,176 @@ class WebCrawler:
return results
def run(
- self,
- url: str,
- word_count_threshold=MIN_WORD_THRESHOLD,
- extraction_strategy: ExtractionStrategy = None,
- chunking_strategy: ChunkingStrategy = RegexChunking(),
- bypass_cache: bool = False,
- css_selector: str = None,
- screenshot: bool = False,
- user_agent: str = None,
- verbose=True,
- **kwargs,
- ) -> CrawlResult:
- try:
- extraction_strategy = extraction_strategy or NoExtractionStrategy()
- extraction_strategy.verbose = verbose
- if not isinstance(extraction_strategy, ExtractionStrategy):
- raise ValueError("Unsupported extraction strategy")
- if not isinstance(chunking_strategy, ChunkingStrategy):
- raise ValueError("Unsupported chunking strategy")
-
- word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
+ self,
+ url: str,
+ word_count_threshold=MIN_WORD_THRESHOLD,
+ extraction_strategy: ExtractionStrategy = None,
+ chunking_strategy: ChunkingStrategy = RegexChunking(),
+ bypass_cache: bool = False,
+ css_selector: str = None,
+ screenshot: bool = False,
+ user_agent: str = None,
+ verbose=True,
+ **kwargs,
+ ) -> CrawlResult:
+ try:
+ extraction_strategy = extraction_strategy or NoExtractionStrategy()
+ extraction_strategy.verbose = verbose
+ if not isinstance(extraction_strategy, ExtractionStrategy):
+ raise ValueError("Unsupported extraction strategy")
+ if not isinstance(chunking_strategy, ChunkingStrategy):
+ raise ValueError("Unsupported chunking strategy")
- cached = None
- screenshot_data = None
- extracted_content = None
- if not bypass_cache and not self.always_by_pass_cache:
- cached = get_cached_url(url)
-
- if kwargs.get("warmup", True) and not self.ready:
- return None
-
- if cached:
- html = sanitize_input_encode(cached[1])
- extracted_content = sanitize_input_encode(cached[4])
- if screenshot:
- screenshot_data = cached[9]
- if not screenshot_data:
- cached = None
-
- if not cached or not html:
- if user_agent:
- self.crawler_strategy.update_user_agent(user_agent)
- t1 = time.time()
- html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
- t2 = time.time()
- if verbose:
- print(f"[LOG] π Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
- if screenshot:
- screenshot_data = self.crawler_strategy.take_screenshot()
+ word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
-
- crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
- crawl_result.success = bool(html)
- return crawl_result
- except Exception as e:
- if not hasattr(e, "msg"):
- e.msg = str(e)
- print(f"[ERROR] π« Failed to crawl {url}, error: {e.msg}")
- return CrawlResult(url=url, html="", success=False, error_message=e.msg)
+ cached = None
+ screenshot_data = None
+ extracted_content = None
+ if not bypass_cache and not self.always_by_pass_cache:
+ cached = get_cached_url(url)
+
+ if kwargs.get("warmup", True) and not self.ready:
+ return None
+
+ if cached:
+ html = sanitize_input_encode(cached[1])
+ extracted_content = sanitize_input_encode(cached[4])
+ if screenshot:
+ screenshot_data = cached[9]
+ if not screenshot_data:
+ cached = None
+
+ if not cached or not html:
+ if user_agent:
+ self.crawler_strategy.update_user_agent(user_agent)
+ t1 = time.time()
+ html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
+ t2 = time.time()
+ if verbose:
+ print(
+ f"[LOG] π Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+ )
+ if screenshot:
+ screenshot_data = self.crawler_strategy.take_screenshot()
+
+ crawl_result = self.process_html(
+ url,
+ html,
+ extracted_content,
+ word_count_threshold,
+ extraction_strategy,
+ chunking_strategy,
+ css_selector,
+ screenshot_data,
+ verbose,
+ bool(cached),
+ **kwargs,
+ )
+ crawl_result.success = bool(html)
+ return crawl_result
+ except Exception as e:
+ if not hasattr(e, "msg"):
+ e.msg = str(e)
+ print(f"[ERROR] π« Failed to crawl {url}, error: {e.msg}")
+ return CrawlResult(url=url, html="", success=False, error_message=e.msg)
def process_html(
- self,
- url: str,
- html: str,
- extracted_content: str,
- word_count_threshold: int,
- extraction_strategy: ExtractionStrategy,
- chunking_strategy: ChunkingStrategy,
- css_selector: str,
- screenshot: bool,
- verbose: bool,
- is_cached: bool,
- **kwargs,
- ) -> CrawlResult:
- t = time.time()
- # Extract content from HTML
- try:
- t1 = time.time()
- scrapping_strategy = WebScrapingStrategy()
- extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
- result = scrapping_strategy.scrap(
- url,
- html,
- word_count_threshold=word_count_threshold,
- css_selector=css_selector,
- only_text=kwargs.get("only_text", False),
- image_description_min_word_threshold=kwargs.get(
- "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
- ),
- **extra_params,
+ self,
+ url: str,
+ html: str,
+ extracted_content: str,
+ word_count_threshold: int,
+ extraction_strategy: ExtractionStrategy,
+ chunking_strategy: ChunkingStrategy,
+ css_selector: str,
+ screenshot: bool,
+ verbose: bool,
+ is_cached: bool,
+ **kwargs,
+ ) -> CrawlResult:
+ t = time.time()
+ # Extract content from HTML
+ try:
+ t1 = time.time()
+ scrapping_strategy = WebScrapingStrategy()
+ extra_params = {
+ k: v
+ for k, v in kwargs.items()
+ if k not in ["only_text", "image_description_min_word_threshold"]
+ }
+ result = scrapping_strategy.scrap(
+ url,
+ html,
+ word_count_threshold=word_count_threshold,
+ css_selector=css_selector,
+ only_text=kwargs.get("only_text", False),
+ image_description_min_word_threshold=kwargs.get(
+ "image_description_min_word_threshold",
+ IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+ ),
+ **extra_params,
+ )
+
+ # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
+ if verbose:
+ print(
+ f"[LOG] π Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
)
-
- # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
- if verbose:
- print(f"[LOG] π Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds")
-
- if result is None:
- raise ValueError(f"Failed to extract content from the website: {url}")
- except InvalidCSSSelectorError as e:
- raise ValueError(str(e))
-
- cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
- markdown = sanitize_input_encode(result.get("markdown", ""))
- media = result.get("media", [])
- links = result.get("links", [])
- metadata = result.get("metadata", {})
-
- if extracted_content is None:
- if verbose:
- print(f"[LOG] π₯ Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
- sections = chunking_strategy.chunk(markdown)
- extracted_content = extraction_strategy.run(url, sections)
- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
+ if result is None:
+ raise ValueError(f"Failed to extract content from the website: {url}")
+ except InvalidCSSSelectorError as e:
+ raise ValueError(str(e))
- if verbose:
- print(f"[LOG] π Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.")
-
- screenshot = None if not screenshot else screenshot
-
- if not is_cached:
- cache_url(
- url,
- html,
- cleaned_html,
- markdown,
- extracted_content,
- True,
- json.dumps(media),
- json.dumps(links),
- json.dumps(metadata),
- screenshot=screenshot,
- )
-
- return CrawlResult(
- url=url,
- html=html,
- cleaned_html=format_html(cleaned_html),
- markdown=markdown,
- media=media,
- links=links,
- metadata=metadata,
+ cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+ markdown = sanitize_input_encode(result.get("markdown", ""))
+ media = result.get("media", [])
+ links = result.get("links", [])
+ metadata = result.get("metadata", {})
+
+ if extracted_content is None:
+ if verbose:
+ print(
+ f"[LOG] π₯ Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}"
+ )
+
+ sections = chunking_strategy.chunk(markdown)
+ extracted_content = extraction_strategy.run(url, sections)
+ extracted_content = json.dumps(
+ extracted_content, indent=4, default=str, ensure_ascii=False
+ )
+
+ if verbose:
+ print(
+ f"[LOG] π Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
+ )
+
+ screenshot = None if not screenshot else screenshot
+
+ if not is_cached:
+ cache_url(
+ url,
+ html,
+ cleaned_html,
+ markdown,
+ extracted_content,
+ True,
+ json.dumps(media),
+ json.dumps(links),
+ json.dumps(metadata),
screenshot=screenshot,
- extracted_content=extracted_content,
- success=True,
- error_message="",
- )
\ No newline at end of file
+ )
+
+ return CrawlResult(
+ url=url,
+ html=html,
+ cleaned_html=format_html(cleaned_html),
+ markdown=markdown,
+ media=media,
+ links=links,
+ metadata=metadata,
+ screenshot=screenshot,
+ extracted_content=extracted_content,
+ success=True,
+ error_message="",
+ )
diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py
index 769c479e..ec734245 100644
--- a/docs/examples/amazon_product_extraction_direct_url.py
+++ b/docs/examples/amazon_product_extraction_direct_url.py
@@ -9,13 +9,11 @@ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
+
async def extract_amazon_products():
# Initialize browser config
- browser_config = BrowserConfig(
- browser_type="chromium",
- headless=True
- )
-
+ browser_config = BrowserConfig(browser_type="chromium", headless=True)
+
# Initialize crawler config with JSON CSS extraction strategy
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(
@@ -27,74 +25,70 @@ async def extract_amazon_products():
"name": "asin",
"selector": "",
"type": "attribute",
- "attribute": "data-asin"
- },
- {
- "name": "title",
- "selector": "h2 a span",
- "type": "text"
+ "attribute": "data-asin",
},
+ {"name": "title", "selector": "h2 a span", "type": "text"},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
- "attribute": "href"
+ "attribute": "href",
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
- "attribute": "src"
+ "attribute": "src",
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
- "type": "text"
+ "type": "text",
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
- "type": "text"
+ "type": "text",
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
- "type": "exists"
+ "type": "exists",
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
- "multiple": True
- }
- ]
+ "multiple": True,
+ },
+ ],
}
)
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
-
+
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
-
+
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
-
+
# Process each product in the list
for product in products:
print("\nProduct Details:")
@@ -105,10 +99,12 @@ async def extract_amazon_products():
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
- if product.get('delivery_info'):
+ if product.get("delivery_info"):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
+
if __name__ == "__main__":
import asyncio
+
asyncio.run(extract_amazon_products())
diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py
index a17d60c5..5118b5d9 100644
--- a/docs/examples/amazon_product_extraction_using_hooks.py
+++ b/docs/examples/amazon_product_extraction_using_hooks.py
@@ -10,17 +10,17 @@ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext
+
async def extract_amazon_products():
# Initialize browser config
browser_config = BrowserConfig(
# browser_type="chromium",
headless=True
)
-
+
# Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
-
extraction_strategy=JsonCssExtractionStrategy(
schema={
"name": "Amazon Product Search Results",
@@ -30,102 +30,105 @@ async def extract_amazon_products():
"name": "asin",
"selector": "",
"type": "attribute",
- "attribute": "data-asin"
- },
- {
- "name": "title",
- "selector": "h2 a span",
- "type": "text"
+ "attribute": "data-asin",
},
+ {"name": "title", "selector": "h2 a span", "type": "text"},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
- "attribute": "href"
+ "attribute": "href",
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
- "attribute": "src"
+ "attribute": "src",
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
- "type": "text"
+ "type": "text",
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
- "type": "text"
+ "type": "text",
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
- "type": "exists"
+ "type": "exists",
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
- "multiple": True
- }
- ]
+ "multiple": True,
+ },
+ ],
}
- )
+ ),
)
url = "https://www.amazon.com/"
-
- async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+
+ async def after_goto(
+ page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+ ):
"""Hook called after navigating to each URL"""
print(f"[HOOK] after_goto - Successfully loaded: {url}")
-
+
try:
# Wait for search box to be available
- search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
-
+ search_box = await page.wait_for_selector(
+ "#twotabsearchtextbox", timeout=1000
+ )
+
# Type the search query
- await search_box.fill('Samsung Galaxy Tab')
-
+ await search_box.fill("Samsung Galaxy Tab")
+
# Get the search button and prepare for navigation
- search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
-
+ search_button = await page.wait_for_selector(
+ "#nav-search-submit-button", timeout=1000
+ )
+
# Click with navigation waiting
await search_button.click()
-
+
# Wait for search results to load
- await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+ await page.wait_for_selector(
+ '[data-component-type="s-search-result"]', timeout=10000
+ )
print("[HOOK] Search completed and results loaded!")
-
+
except Exception as e:
print(f"[HOOK] Error during search operation: {str(e)}")
-
- return page
-
+
+ return page
+
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
-
crawler.crawler_strategy.set_hook("after_goto", after_goto)
-
+
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
-
+
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
-
+
# Process each product in the list
for product in products:
print("\nProduct Details:")
@@ -136,10 +139,12 @@ async def extract_amazon_products():
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
- if product.get('delivery_info'):
+ if product.get("delivery_info"):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
+
if __name__ == "__main__":
import asyncio
+
asyncio.run(extract_amazon_products())
diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py
index 15e5d6f5..e412c931 100644
--- a/docs/examples/amazon_product_extraction_using_use_javascript.py
+++ b/docs/examples/amazon_product_extraction_using_use_javascript.py
@@ -8,7 +8,7 @@ from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
-from playwright.async_api import Page, BrowserContext
+
async def extract_amazon_products():
# Initialize browser config
@@ -16,7 +16,7 @@ async def extract_amazon_products():
# browser_type="chromium",
headless=True
)
-
+
js_code_to_search = """
const task = async () => {
document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
@@ -30,7 +30,7 @@ async def extract_amazon_products():
"""
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
- js_code = js_code_to_search,
+ js_code=js_code_to_search,
wait_for='css:[data-component-type="s-search-result"]',
extraction_strategy=JsonCssExtractionStrategy(
schema={
@@ -41,75 +41,70 @@ async def extract_amazon_products():
"name": "asin",
"selector": "",
"type": "attribute",
- "attribute": "data-asin"
- },
- {
- "name": "title",
- "selector": "h2 a span",
- "type": "text"
+ "attribute": "data-asin",
},
+ {"name": "title", "selector": "h2 a span", "type": "text"},
{
"name": "url",
"selector": "h2 a",
"type": "attribute",
- "attribute": "href"
+ "attribute": "href",
},
{
"name": "image",
"selector": ".s-image",
"type": "attribute",
- "attribute": "src"
+ "attribute": "src",
},
{
"name": "rating",
"selector": ".a-icon-star-small .a-icon-alt",
- "type": "text"
+ "type": "text",
},
{
"name": "reviews_count",
"selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
- "type": "text"
+ "type": "text",
},
{
"name": "price",
"selector": ".a-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "original_price",
"selector": ".a-price.a-text-price .a-offscreen",
- "type": "text"
+ "type": "text",
},
{
"name": "sponsored",
"selector": ".puis-sponsored-label-text",
- "type": "exists"
+ "type": "exists",
},
{
"name": "delivery_info",
"selector": "[data-cy='delivery-recipe'] .a-color-base",
"type": "text",
- "multiple": True
- }
- ]
+ "multiple": True,
+ },
+ ],
}
- )
+ ),
)
# Example search URL (you should replace with your actual Amazon URL)
url = "https://www.amazon.com/"
-
-
+
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Extract the data
result = await crawler.arun(url=url, config=crawler_config)
-
+
# Process and print the results
if result and result.extracted_content:
# Parse the JSON string into a list of products
products = json.loads(result.extracted_content)
-
+
# Process each product in the list
for product in products:
print("\nProduct Details:")
@@ -120,10 +115,12 @@ async def extract_amazon_products():
print(f"Rating: {product.get('rating')}")
print(f"Reviews: {product.get('reviews_count')}")
print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
- if product.get('delivery_info'):
+ if product.get("delivery_info"):
print(f"Delivery: {' '.join(product['delivery_info'])}")
print("-" * 80)
+
if __name__ == "__main__":
import asyncio
+
asyncio.run(extract_amazon_products())
diff --git a/docs/examples/async_webcrawler_multiple_urls_example.py b/docs/examples/async_webcrawler_multiple_urls_example.py
index 1d63ac80..52309d13 100644
--- a/docs/examples/async_webcrawler_multiple_urls_example.py
+++ b/docs/examples/async_webcrawler_multiple_urls_example.py
@@ -1,12 +1,16 @@
# File: async_webcrawler_multiple_urls_example.py
import os, sys
+
# append 2 parent directories to sys.path to import crawl4ai
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
sys.path.append(parent_dir)
import asyncio
from crawl4ai import AsyncWebCrawler
+
async def main():
# Initialize the AsyncWebCrawler
async with AsyncWebCrawler(verbose=True) as crawler:
@@ -16,7 +20,7 @@ async def main():
"https://python.org",
"https://github.com",
"https://stackoverflow.com",
- "https://news.ycombinator.com"
+ "https://news.ycombinator.com",
]
# Set up crawling parameters
@@ -27,7 +31,7 @@ async def main():
urls=urls,
word_count_threshold=word_count_threshold,
bypass_cache=True,
- verbose=True
+ verbose=True,
)
# Process the results
@@ -36,7 +40,9 @@ async def main():
print(f"Successfully crawled: {result.url}")
print(f"Title: {result.metadata.get('title', 'N/A')}")
print(f"Word count: {len(result.markdown.split())}")
- print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+ print(
+ f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
+ )
print(f"Number of images: {len(result.media.get('images', []))}")
print("---")
else:
@@ -44,5 +50,6 @@ async def main():
print(f"Error: {result.error_message}")
print("---")
+
if __name__ == "__main__":
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py
index f57dc147..73637a71 100644
--- a/docs/examples/browser_optimization_example.py
+++ b/docs/examples/browser_optimization_example.py
@@ -6,10 +6,8 @@ This example demonstrates optimal browser usage patterns in Crawl4AI:
"""
import asyncio
-import os
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
diff --git a/docs/examples/crawlai_vs_firecrawl.py b/docs/examples/crawlai_vs_firecrawl.py
index b50b06da..f8b70dc7 100644
--- a/docs/examples/crawlai_vs_firecrawl.py
+++ b/docs/examples/crawlai_vs_firecrawl.py
@@ -1,31 +1,32 @@
import os, time
+
# append the path to the root of the project
import sys
import asyncio
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from firecrawl import FirecrawlApp
from crawl4ai import AsyncWebCrawler
-__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
+
+__data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data"
+
async def compare():
- app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+ app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
# Tet Firecrawl with a simple crawl
start = time.time()
scrape_status = app.scrape_url(
- 'https://www.nbcnews.com/business',
- params={'formats': ['markdown', 'html']}
+ "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print(f"Time taken: {end - start} seconds")
- print(len(scrape_status['markdown']))
+ print(len(scrape_status["markdown"]))
# save the markdown content with provider name
with open(f"{__data__}/firecrawl_simple.md", "w") as f:
- f.write(scrape_status['markdown'])
+ f.write(scrape_status["markdown"])
# Count how many "cldnry.s-nbcnews.com" are in the markdown
- print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
-
-
+ print(scrape_status["markdown"].count("cldnry.s-nbcnews.com"))
async with AsyncWebCrawler() as crawler:
start = time.time()
@@ -33,13 +34,13 @@ async def compare():
url="https://www.nbcnews.com/business",
# js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
word_count_threshold=0,
- bypass_cache=True,
- verbose=False
+ bypass_cache=True,
+ verbose=False,
)
end = time.time()
print(f"Time taken: {end - start} seconds")
print(len(result.markdown))
- # save the markdown content with provider name
+ # save the markdown content with provider name
with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
@@ -48,10 +49,12 @@ async def compare():
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
- js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+ js_code=[
+ "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+ ],
word_count_threshold=0,
- bypass_cache=True,
- verbose=False
+ bypass_cache=True,
+ verbose=False,
)
end = time.time()
print(f"Time taken: {end - start} seconds")
@@ -61,7 +64,7 @@ async def compare():
f.write(result.markdown)
# count how many "cldnry.s-nbcnews.com" are in the markdown
print(result.markdown.count("cldnry.s-nbcnews.com"))
-
+
+
if __name__ == "__main__":
asyncio.run(compare())
-
\ No newline at end of file
diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py
new file mode 100644
index 00000000..c9708ccc
--- /dev/null
+++ b/docs/examples/dispatcher_example.py
@@ -0,0 +1,135 @@
+import asyncio
+import time
+from rich import print
+from rich.table import Table
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ MemoryAdaptiveDispatcher,
+ SemaphoreDispatcher,
+ RateLimiter,
+ CrawlerMonitor,
+ DisplayMode,
+ CacheMode,
+)
+
+
+async def memory_adaptive(urls, browser_config, run_config):
+ """Memory adaptive crawler with monitoring"""
+ start = time.perf_counter()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=70.0,
+ max_session_permit=10,
+ monitor=CrawlerMonitor(
+ max_visible_rows=15, display_mode=DisplayMode.DETAILED
+ ),
+ )
+ results = await crawler.arun_many(
+ urls, config=run_config, dispatcher=dispatcher
+ )
+ duration = time.perf_counter() - start
+ return len(results), duration
+
+
+async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
+ """Memory adaptive crawler with rate limiting"""
+ start = time.perf_counter()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=70.0,
+ max_session_permit=10,
+ rate_limiter=RateLimiter(
+ base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
+ ),
+ monitor=CrawlerMonitor(
+ max_visible_rows=15, display_mode=DisplayMode.DETAILED
+ ),
+ )
+ results = await crawler.arun_many(
+ urls, config=run_config, dispatcher=dispatcher
+ )
+ duration = time.perf_counter() - start
+ return len(results), duration
+
+
+async def semaphore(urls, browser_config, run_config):
+ """Basic semaphore crawler"""
+ start = time.perf_counter()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ dispatcher = SemaphoreDispatcher(
+ semaphore_count=5,
+ monitor=CrawlerMonitor(
+ max_visible_rows=15, display_mode=DisplayMode.DETAILED
+ ),
+ )
+ results = await crawler.arun_many(
+ urls, config=run_config, dispatcher=dispatcher
+ )
+ duration = time.perf_counter() - start
+ return len(results), duration
+
+
+async def semaphore_with_rate_limit(urls, browser_config, run_config):
+ """Semaphore crawler with rate limiting"""
+ start = time.perf_counter()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ dispatcher = SemaphoreDispatcher(
+ semaphore_count=5,
+ rate_limiter=RateLimiter(
+ base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
+ ),
+ monitor=CrawlerMonitor(
+ max_visible_rows=15, display_mode=DisplayMode.DETAILED
+ ),
+ )
+ results = await crawler.arun_many(
+ urls, config=run_config, dispatcher=dispatcher
+ )
+ duration = time.perf_counter() - start
+ return len(results), duration
+
+
+def create_performance_table(results):
+ """Creates a rich table showing performance results"""
+ table = Table(title="Crawler Strategy Performance Comparison")
+ table.add_column("Strategy", style="cyan")
+ table.add_column("URLs Crawled", justify="right", style="green")
+ table.add_column("Time (seconds)", justify="right", style="yellow")
+ table.add_column("URLs/second", justify="right", style="magenta")
+
+ sorted_results = sorted(results.items(), key=lambda x: x[1][1])
+
+ for strategy, (urls_crawled, duration) in sorted_results:
+ urls_per_second = urls_crawled / duration
+ table.add_row(
+ strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
+ )
+
+ return table
+
+
+async def main():
+ urls = [f"https://example.com/page{i}" for i in range(1, 20)]
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ results = {
+ "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
+ "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
+ urls, browser_config, run_config
+ ),
+ "Semaphore": await semaphore(urls, browser_config, run_config),
+ "Semaphore + Rate Limit": await semaphore_with_rate_limit(
+ urls, browser_config, run_config
+ ),
+ }
+
+ table = create_performance_table(results)
+ print("\nPerformance Summary:")
+ print(table)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py
index 48acc809..fe1d0727 100644
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -6,63 +6,80 @@ import base64
import os
from typing import Dict, Any
+
class Crawl4AiTester:
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
self.base_url = base_url
- self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback
- self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
-
- def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+ self.api_token = (
+ api_token or os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+ ) # Check environment variable as fallback
+ self.headers = (
+ {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+ )
+
+ def submit_and_wait(
+ self, request_data: Dict[str, Any], timeout: int = 300
+ ) -> Dict[str, Any]:
# Submit crawl job
- response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+ response = requests.post(
+ f"{self.base_url}/crawl", json=request_data, headers=self.headers
+ )
if response.status_code == 403:
raise Exception("API token is invalid or missing")
task_id = response.json()["task_id"]
print(f"Task ID: {task_id}")
-
+
# Poll for result
start_time = time.time()
while True:
if time.time() - start_time > timeout:
- raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-
- result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+ raise TimeoutError(
+ f"Task {task_id} did not complete within {timeout} seconds"
+ )
+
+ result = requests.get(
+ f"{self.base_url}/task/{task_id}", headers=self.headers
+ )
status = result.json()
-
+
if status["status"] == "failed":
print("Task failed:", status.get("error"))
raise Exception(f"Task failed: {status.get('error')}")
-
+
if status["status"] == "completed":
return status
-
+
time.sleep(2)
-
+
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
- response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+ response = requests.post(
+ f"{self.base_url}/crawl_sync",
+ json=request_data,
+ headers=self.headers,
+ timeout=60,
+ )
if response.status_code == 408:
raise TimeoutError("Task did not complete within server timeout")
response.raise_for_status()
return response.json()
-
+
def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
"""Directly crawl without using task queue"""
response = requests.post(
- f"{self.base_url}/crawl_direct",
- json=request_data,
- headers=self.headers
+ f"{self.base_url}/crawl_direct", json=request_data, headers=self.headers
)
response.raise_for_status()
return response.json()
+
def test_docker_deployment(version="basic"):
tester = Crawl4AiTester(
- base_url="http://localhost:11235" ,
+ base_url="http://localhost:11235",
# base_url="https://api.crawl4ai.com" # just for example
# api_token="test" # just for example
)
print(f"Testing Crawl4AI Docker {version} version")
-
+
# Health check with timeout and retry
max_retries = 5
for i in range(max_retries):
@@ -70,19 +87,19 @@ def test_docker_deployment(version="basic"):
health = requests.get(f"{tester.base_url}/health", timeout=10)
print("Health check:", health.json())
break
- except requests.exceptions.RequestException as e:
+ except requests.exceptions.RequestException:
if i == max_retries - 1:
print(f"Failed to connect after {max_retries} attempts")
sys.exit(1)
print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
time.sleep(5)
-
+
# Test cases based on version
test_basic_crawl_direct(tester)
test_basic_crawl(tester)
test_basic_crawl(tester)
test_basic_crawl_sync(tester)
-
+
if version in ["full", "transformer"]:
test_cosine_extraction(tester)
@@ -92,49 +109,52 @@ def test_docker_deployment(version="basic"):
test_llm_extraction(tester)
test_llm_with_ollama(tester)
test_screenshot(tester)
-
+
def test_basic_crawl(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl ===")
request = {
"urls": "https://www.nbcnews.com/business",
- "priority": 10,
- "session_id": "test"
+ "priority": 10,
+ "session_id": "test",
}
-
+
result = tester.submit_and_wait(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
assert len(result["result"]["markdown"]) > 0
+
def test_basic_crawl_sync(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl (Sync) ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 10,
- "session_id": "test"
+ "session_id": "test",
}
-
+
result = tester.submit_sync(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
- assert result['status'] == 'completed'
- assert result['result']['success']
- assert len(result['result']['markdown']) > 0
-
+ assert result["status"] == "completed"
+ assert result["result"]["success"]
+ assert len(result["result"]["markdown"]) > 0
+
+
def test_basic_crawl_direct(tester: Crawl4AiTester):
print("\n=== Testing Basic Crawl (Direct) ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 10,
# "session_id": "test"
- "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only"
+ "cache_mode": "bypass", # or "enabled", "disabled", "read_only", "write_only"
}
-
+
result = tester.crawl_direct(request)
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
- assert result['result']['success']
- assert len(result['result']['markdown']) > 0
-
+ assert result["result"]["success"]
+ assert len(result["result"]["markdown"]) > 0
+
+
def test_js_execution(tester: Crawl4AiTester):
print("\n=== Testing JS Execution ===")
request = {
@@ -144,32 +164,29 @@ def test_js_execution(tester: Crawl4AiTester):
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
],
"wait_for": "article.tease-card:nth-child(10)",
- "crawler_params": {
- "headless": True
- }
+ "crawler_params": {"headless": True},
}
-
+
result = tester.submit_and_wait(request)
print(f"JS execution result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
+
def test_css_selector(tester: Crawl4AiTester):
print("\n=== Testing CSS Selector ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 7,
"css_selector": ".wide-tease-item__description",
- "crawler_params": {
- "headless": True
- },
- "extra": {"word_count_threshold": 10}
-
+ "crawler_params": {"headless": True},
+ "extra": {"word_count_threshold": 10},
}
-
+
result = tester.submit_and_wait(request)
print(f"CSS selector result length: {len(result['result']['markdown'])}")
assert result["result"]["success"]
+
def test_structured_extraction(tester: Crawl4AiTester):
print("\n=== Testing Structured Extraction ===")
schema = {
@@ -190,21 +207,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
"name": "price",
"selector": "td:nth-child(2)",
"type": "text",
- }
+ },
],
}
-
+
request = {
"urls": "https://www.coinbase.com/explore",
"priority": 9,
- "extraction_config": {
- "type": "json_css",
- "params": {
- "schema": schema
- }
- }
+ "extraction_config": {"type": "json_css", "params": {"schema": schema}},
}
-
+
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
print(f"Extracted {len(extracted)} items")
@@ -212,6 +224,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
assert result["result"]["success"]
assert len(extracted) > 0
+
def test_llm_extraction(tester: Crawl4AiTester):
print("\n=== Testing LLM Extraction ===")
schema = {
@@ -219,20 +232,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
"properties": {
"model_name": {
"type": "string",
- "description": "Name of the OpenAI model."
+ "description": "Name of the OpenAI model.",
},
"input_fee": {
"type": "string",
- "description": "Fee for input token for the OpenAI model."
+ "description": "Fee for input token for the OpenAI model.",
},
"output_fee": {
"type": "string",
- "description": "Fee for output token for the OpenAI model."
- }
+ "description": "Fee for output token for the OpenAI model.",
+ },
},
- "required": ["model_name", "input_fee", "output_fee"]
+ "required": ["model_name", "input_fee", "output_fee"],
}
-
+
request = {
"urls": "https://openai.com/api/pricing",
"priority": 8,
@@ -243,12 +256,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
"api_token": os.getenv("OPENAI_API_KEY"),
"schema": schema,
"extraction_type": "schema",
- "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
- }
+ "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+ },
},
- "crawler_params": {"word_count_threshold": 1}
+ "crawler_params": {"word_count_threshold": 1},
}
-
+
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -258,6 +271,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
except Exception as e:
print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")
+
def test_llm_with_ollama(tester: Crawl4AiTester):
print("\n=== Testing LLM with Ollama ===")
schema = {
@@ -265,20 +279,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
"properties": {
"article_title": {
"type": "string",
- "description": "The main title of the news article"
+ "description": "The main title of the news article",
},
"summary": {
"type": "string",
- "description": "A brief summary of the article content"
+ "description": "A brief summary of the article content",
},
"main_topics": {
"type": "array",
"items": {"type": "string"},
- "description": "Main topics or themes discussed in the article"
- }
- }
+ "description": "Main topics or themes discussed in the article",
+ },
+ },
}
-
+
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 8,
@@ -288,13 +302,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
"provider": "ollama/llama2",
"schema": schema,
"extraction_type": "schema",
- "instruction": "Extract the main article information including title, summary, and main topics."
- }
+ "instruction": "Extract the main article information including title, summary, and main topics.",
+ },
},
"extra": {"word_count_threshold": 1},
- "crawler_params": {"verbose": True}
+ "crawler_params": {"verbose": True},
}
-
+
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -303,6 +317,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
except Exception as e:
print(f"Ollama extraction test failed: {str(e)}")
+
def test_cosine_extraction(tester: Crawl4AiTester):
print("\n=== Testing Cosine Extraction ===")
request = {
@@ -314,11 +329,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
"semantic_filter": "business finance economy",
"word_count_threshold": 10,
"max_dist": 0.2,
- "top_k": 3
- }
- }
+ "top_k": 3,
+ },
+ },
}
-
+
try:
result = tester.submit_and_wait(request)
extracted = json.loads(result["result"]["extracted_content"])
@@ -328,30 +343,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
except Exception as e:
print(f"Cosine extraction test failed: {str(e)}")
+
def test_screenshot(tester: Crawl4AiTester):
print("\n=== Testing Screenshot ===")
request = {
"urls": "https://www.nbcnews.com/business",
"priority": 5,
"screenshot": True,
- "crawler_params": {
- "headless": True
- }
+ "crawler_params": {"headless": True},
}
-
+
result = tester.submit_and_wait(request)
print("Screenshot captured:", bool(result["result"]["screenshot"]))
-
+
if result["result"]["screenshot"]:
# Save screenshot
screenshot_data = base64.b64decode(result["result"]["screenshot"])
with open("test_screenshot.jpg", "wb") as f:
f.write(screenshot_data)
print("Screenshot saved as test_screenshot.jpg")
-
+
assert result["result"]["success"]
+
if __name__ == "__main__":
version = sys.argv[1] if len(sys.argv) > 1 else "basic"
# version = "full"
- test_docker_deployment(version)
\ No newline at end of file
+ test_docker_deployment(version)
diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_example.py
index 348b891e..658f7521 100644
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_example.py
@@ -9,18 +9,17 @@ This example shows how to:
import asyncio
import os
-from typing import Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,
- JsonXPathExtractionStrategy
+ JsonXPathExtractionStrategy,
)
-from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
"""Helper function to run extraction with proper configuration"""
try:
@@ -30,78 +29,90 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
extraction_strategy=strategy,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter() # For fit_markdown support
- )
+ ),
)
-
+
# Run the crawler
result = await crawler.arun(url=url, config=config)
-
+
if result.success:
print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
- print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+ print(
+ f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
+ )
else:
print(f"Error in {name}: Crawl failed")
-
+
except Exception as e:
print(f"Error in {name}: {str(e)}")
+
async def main():
# Example URL (replace with actual URL)
url = "https://example.com/product-page"
-
+
# Configure browser settings
- browser_config = BrowserConfig(
- headless=True,
- verbose=True
- )
-
+ browser_config = BrowserConfig(headless=True, verbose=True)
+
# Initialize extraction strategies
-
+
# 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy(
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
- instruction="Extract product information including name, price, and description"
+ instruction="Extract product information including name, price, and description",
)
-
+
html_strategy = LLMExtractionStrategy(
input_format="html",
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
- instruction="Extract product information from HTML including structured data"
+ instruction="Extract product information from HTML including structured data",
)
-
+
fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown",
provider="openai/gpt-4o-mini",
api_token=os.getenv("OPENAI_API_KEY"),
- instruction="Extract product information from cleaned markdown"
+ instruction="Extract product information from cleaned markdown",
)
-
+
# 2. JSON CSS Extraction (automatically uses HTML input)
css_schema = {
"baseSelector": ".product",
"fields": [
{"name": "title", "selector": "h1.product-title", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
- {"name": "description", "selector": ".description", "type": "text"}
- ]
+ {"name": "description", "selector": ".description", "type": "text"},
+ ],
}
css_strategy = JsonCssExtractionStrategy(schema=css_schema)
-
+
# 3. JSON XPath Extraction (automatically uses HTML input)
xpath_schema = {
"baseSelector": "//div[@class='product']",
"fields": [
- {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
- {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
- {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
- ]
+ {
+ "name": "title",
+ "selector": ".//h1[@class='product-title']/text()",
+ "type": "text",
+ },
+ {
+ "name": "price",
+ "selector": ".//span[@class='price']/text()",
+ "type": "text",
+ },
+ {
+ "name": "description",
+ "selector": ".//div[@class='description']/text()",
+ "type": "text",
+ },
+ ],
}
xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
-
+
# Use context manager for proper resource handling
async with AsyncWebCrawler(config=browser_config) as crawler:
# Run all strategies
@@ -111,5 +122,6 @@ async def main():
await run_extraction(crawler, url, css_strategy, "CSS Extraction")
await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")
+
if __name__ == "__main__":
asyncio.run(main())
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
index 18534d0e..97a8187e 100644
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -1,20 +1,23 @@
import asyncio
from crawl4ai import *
+
async def main():
browser_config = BrowserConfig(headless=True, verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
- )
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ )
+ ),
)
result = await crawler.arun(
- url="https://www.helloworld.org",
- config=crawler_config
+ url="https://www.helloworld.org", config=crawler_config
)
print(result.markdown_v2.raw_markdown[:500])
+
if __name__ == "__main__":
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py
index 09e0bc17..de0aa6e1 100644
--- a/docs/examples/hooks_example.py
+++ b/docs/examples/hooks_example.py
@@ -1,19 +1,18 @@
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from playwright.async_api import Page, BrowserContext
+
async def main():
print("π Hooks Example: Demonstrating different hook use cases")
# Configure browser settings
- browser_config = BrowserConfig(
- headless=True
- )
-
+ browser_config = BrowserConfig(headless=True)
+
# Configure crawler settings
crawler_run_config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="body",
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS,
)
# Create crawler instance
@@ -30,16 +29,22 @@ async def main():
"""Hook called after a new page and context are created"""
print("[HOOK] on_page_context_created - New page created!")
# Example: Set default viewport size
- await context.add_cookies([{
- 'name': 'session_id',
- 'value': 'example_session',
- 'domain': '.example.com',
- 'path': '/'
- }])
- await page.set_viewport_size({"width": 1920, "height": 1080})
+ await context.add_cookies(
+ [
+ {
+ "name": "session_id",
+ "value": "example_session",
+ "domain": ".example.com",
+ "path": "/",
+ }
+ ]
+ )
+ await page.set_viewport_size({"width": 1080, "height": 800})
return page
- async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
+ async def on_user_agent_updated(
+ page: Page, context: BrowserContext, user_agent: str, **kwargs
+ ):
"""Hook called when the user agent is updated"""
print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
return page
@@ -53,17 +58,17 @@ async def main():
"""Hook called before navigating to each URL"""
print(f"[HOOK] before_goto - About to visit: {url}")
# Example: Add custom headers for the request
- await page.set_extra_http_headers({
- "Custom-Header": "my-value"
- })
+ await page.set_extra_http_headers({"Custom-Header": "my-value"})
return page
- async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+ async def after_goto(
+ page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+ ):
"""Hook called after navigating to each URL"""
print(f"[HOOK] after_goto - Successfully loaded: {url}")
# Example: Wait for a specific element to be loaded
try:
- await page.wait_for_selector('.content', timeout=1000)
+ await page.wait_for_selector(".content", timeout=1000)
print("Content element found!")
except:
print("Content element not found, continuing anyway")
@@ -76,7 +81,9 @@ async def main():
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
return page
- async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
+ async def before_return_html(
+ page: Page, context: BrowserContext, html: str, **kwargs
+ ):
"""Hook called before returning the HTML content"""
print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
# Example: You could modify the HTML content here if needed
@@ -84,7 +91,9 @@ async def main():
# Set all the hooks
crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
- crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+ crawler.crawler_strategy.set_hook(
+ "on_page_context_created", on_page_context_created
+ )
crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
crawler.crawler_strategy.set_hook("before_goto", before_goto)
@@ -95,13 +104,15 @@ async def main():
await crawler.start()
# Example usage: crawl a simple website
- url = 'https://example.com'
+ url = "https://example.com"
result = await crawler.arun(url, config=crawler_run_config)
print(f"\nCrawled URL: {result.url}")
print(f"HTML length: {len(result.html)}")
-
+
await crawler.close()
+
if __name__ == "__main__":
import asyncio
- asyncio.run(main())
\ No newline at end of file
+
+ asyncio.run(main())
diff --git a/docs/examples/language_support_example.py b/docs/examples/language_support_example.py
index b74a8402..712db2c4 100644
--- a/docs/examples/language_support_example.py
+++ b/docs/examples/language_support_example.py
@@ -1,6 +1,7 @@
import asyncio
from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
async def main():
# Example 1: Setting language when creating the crawler
crawler1 = AsyncWebCrawler(
@@ -9,11 +10,15 @@ async def main():
)
)
result1 = await crawler1.arun("https://www.example.com")
- print("Example 1 result:", result1.extracted_content[:100]) # Print first 100 characters
+ print(
+ "Example 1 result:", result1.extracted_content[:100]
+ ) # Print first 100 characters
# Example 2: Setting language before crawling
crawler2 = AsyncWebCrawler()
- crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+ crawler2.crawler_strategy.headers[
+ "Accept-Language"
+ ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
result2 = await crawler2.arun("https://www.example.com")
print("Example 2 result:", result2.extracted_content[:100])
@@ -21,7 +26,7 @@ async def main():
crawler3 = AsyncWebCrawler()
result3 = await crawler3.arun(
"https://www.example.com",
- headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+ headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
)
print("Example 3 result:", result3.extracted_content[:100])
@@ -31,15 +36,15 @@ async def main():
("https://www.example.org", "es-ES,es;q=0.9"),
("https://www.example.net", "de-DE,de;q=0.9"),
]
-
+
crawler4 = AsyncWebCrawler()
- results = await asyncio.gather(*[
- crawler4.arun(url, headers={"Accept-Language": lang})
- for url, lang in urls
- ])
-
+ results = await asyncio.gather(
+ *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls]
+ )
+
for url, result in zip([u for u, _ in urls], results):
print(f"Result for {url}:", result.extracted_content[:100])
+
if __name__ == "__main__":
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
index 5ae3d4d1..e9e90dd2 100644
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -3,32 +3,37 @@ from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
-url = r'https://openai.com/api/pricing/'
+url = r"https://openai.com/api/pricing/"
+
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
- output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+ output_fee: str = Field(
+ ..., description="Fee for output token for the OpenAI model."
+ )
+
from crawl4ai import AsyncWebCrawler
+
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url,
word_count_threshold=1,
- extraction_strategy= LLMExtractionStrategy(
+ extraction_strategy=LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
- provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
+ provider="groq/llama-3.1-70b-versatile",
+ api_token=os.getenv("GROQ_API_KEY"),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
- instruction="From the crawled content, extract all mentioned model names along with their " \
- "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
- 'One extracted model JSON format should look like this: ' \
- '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+ instruction="From the crawled content, extract all mentioned model names along with their "
+ "fees for input and output tokens. Make sure not to miss anything in the entire content. "
+ "One extracted model JSON format should look like this: "
+ '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
),
-
)
print("Success:", result.success)
model_fees = json.loads(result.extracted_content)
@@ -37,4 +42,5 @@ async def main():
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
+
asyncio.run(main())
diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py
index 4c4a9d86..b58443bd 100644
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -8,12 +8,12 @@ import asyncio
import time
import json
import re
-from typing import Dict, List
+from typing import Dict
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
@@ -62,6 +62,7 @@ async def clean_content():
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
+
async def link_analysis():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED,
@@ -76,9 +77,10 @@ async def link_analysis():
print(f"Found {len(result.links['internal'])} internal links")
print(f"Found {len(result.links['external'])} external links")
- for link in result.links['internal'][:5]:
+ for link in result.links["internal"][:5]:
print(f"Href: {link['href']}\nText: {link['text']}\n")
+
# JavaScript Execution Example
async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
@@ -112,25 +114,29 @@ async def simple_example_with_css_selector():
)
print(result.markdown[:500])
+
async def media_handling():
- crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+ )
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- config=crawler_config
+ url="https://www.nbcnews.com/business", config=crawler_config
)
- for img in result.media['images'][:5]:
+ for img in result.media["images"][:5]:
print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
async def custom_hook_workflow(verbose=True):
async with AsyncWebCrawler() as crawler:
# Set a 'before_goto' hook to run custom code just before navigation
- crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
+ crawler.crawler_strategy.set_hook(
+ "before_goto",
+ lambda page, context: print("[Hook] Preparing to navigate..."),
+ )
# Perform the crawl operation
- result = await crawler.arun(
- url="https://crawl4ai.com"
- )
+ result = await crawler.arun(url="https://crawl4ai.com")
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
@@ -225,7 +231,7 @@ async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
"name": "KidoCode Courses",
- "baseSelector": "section.charge-methodology .w-tab-content > div",
+ "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
"fields": [
{
"name": "section_title",
@@ -273,6 +279,7 @@ async def extract_structured_data_using_css_extractor():
cache_mode=CacheMode.BYPASS,
extraction_strategy=JsonCssExtractionStrategy(schema),
js_code=[js_click_tabs],
+ delay_before_return_html=1
)
async with AsyncWebCrawler(config=browser_config) as crawler:
@@ -412,21 +419,22 @@ async def cosine_similarity_extraction():
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
word_count_threshold=10,
- max_dist=0.2, # Maximum distance between two words
- linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
- top_k=3, # Number of top keywords to extract
- sim_threshold=0.3, # Similarity threshold for clustering
- semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
- verbose=True
- ),
+ max_dist=0.2, # Maximum distance between two words
+ linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
+ top_k=3, # Number of top keywords to extract
+ sim_threshold=0.3, # Similarity threshold for clustering
+ semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
+ verbose=True,
+ ),
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
- config=crawl_config
+ config=crawl_config,
)
print(json.loads(result.extracted_content)[:5])
+
# Browser Comparison
async def crawl_custom_browser_type():
print("\n--- Browser Comparison ---")
@@ -484,39 +492,42 @@ async def crawl_with_user_simulation():
result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
print(result.markdown)
+
async def ssl_certification():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
- cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
+ cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url='https://example.com',
- config=config
- )
-
+ result = await crawler.arun(url="https://example.com", config=config)
+
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
-
+
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
-
+
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
-
- pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
+
+ pem_data = cert.to_pem(
+ os.path.join(tmp_dir, "certificate.pem")
+ ) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
-
- der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
+
+ der_data = cert.to_der(
+ os.path.join(tmp_dir, "certificate.der")
+ ) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
# Speed Comparison
async def speed_comparison():
print("\n--- Speed Comparison ---")
@@ -581,29 +592,26 @@ async def speed_comparison():
# Main execution
async def main():
# Basic examples
- # await simple_crawl()
- # await simple_example_with_running_js_code()
- # await simple_example_with_css_selector()
+ await simple_crawl()
+ await simple_example_with_running_js_code()
+ await simple_example_with_css_selector()
# Advanced examples
- # await extract_structured_data_using_css_extractor()
+ await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm(
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
)
- # await crawl_dynamic_content_pages_method_1()
- # await crawl_dynamic_content_pages_method_2()
+ await crawl_dynamic_content_pages_method_1()
+ await crawl_dynamic_content_pages_method_2()
# Browser comparisons
- # await crawl_custom_browser_type()
-
- # Performance testing
- # await speed_comparison()
+ await crawl_custom_browser_type()
# Screenshot example
- # await capture_and_save_screenshot(
- # "https://www.example.com",
- # os.path.join(__location__, "tmp/example_screenshot.jpg")
- # )
+ await capture_and_save_screenshot(
+ "https://www.example.com",
+ os.path.join(__location__, "tmp/example_screenshot.jpg")
+ )
if __name__ == "__main__":
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
index e640e6bd..1585ebea 100644
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,6 +1,10 @@
import os, sys
+
# append parent directory to system path
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))); os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692";
+sys.path.append(
+ os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"
import asyncio
# import nest_asyncio
@@ -15,7 +19,7 @@ from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
@@ -32,9 +36,12 @@ print("Website: https://crawl4ai.com")
async def simple_crawl():
print("\n--- Basic Usage ---")
async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
+ )
print(result.markdown[:500]) # Print first 500 characters
+
async def simple_example_with_running_js_code():
print("\n--- Executing JavaScript and Using CSS Selectors ---")
# New code to handle the wait_for parameter
@@ -57,6 +64,7 @@ async def simple_example_with_running_js_code():
)
print(result.markdown[:500]) # Print first 500 characters
+
async def simple_example_with_css_selector():
print("\n--- Using CSS Selectors ---")
async with AsyncWebCrawler(verbose=True) as crawler:
@@ -67,42 +75,44 @@ async def simple_example_with_css_selector():
)
print(result.markdown[:500]) # Print first 500 characters
+
async def use_proxy():
print("\n--- Using a Proxy ---")
print(
"Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
)
# Uncomment and modify the following lines to use a proxy
- async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
+ async with AsyncWebCrawler(
+ verbose=True, proxy="http://your-proxy-url:port"
+ ) as crawler:
result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- cache_mode= CacheMode.BYPASS
+ url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
)
if result.success:
print(result.markdown[:500]) # Print first 500 characters
+
async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
- url=url,
- screenshot=True,
- cache_mode= CacheMode.BYPASS
+ url=url, screenshot=True, cache_mode=CacheMode.BYPASS
)
-
+
if result.success and result.screenshot:
import base64
-
+
# Decode the base64 screenshot data
screenshot_data = base64.b64decode(result.screenshot)
-
+
# Save the screenshot as a JPEG file
- with open(output_path, 'wb') as f:
+ with open(output_path, "wb") as f:
f.write(screenshot_data)
-
+
print(f"Screenshot saved successfully to {output_path}")
else:
print("Failed to capture screenshot")
+
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -110,16 +120,19 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
-async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+
+async def extract_structured_data_using_llm(
+ provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
print(f"\n--- Extracting Structured Data with {provider} ---")
-
+
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
# extra_args = {}
- extra_args={
- "temperature": 0,
+ extra_args = {
+ "temperature": 0,
"top_p": 0.9,
"max_tokens": 2000,
# any other supported parameters for litellm
@@ -139,52 +152,49 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
- extra_args=extra_args
+ extra_args=extra_args,
),
cache_mode=CacheMode.BYPASS,
)
print(result.extracted_content)
+
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
- "name": "KidoCode Courses",
- "baseSelector": "section.charge-methodology .w-tab-content > div",
- "fields": [
- {
- "name": "section_title",
- "selector": "h3.heading-50",
- "type": "text",
- },
- {
- "name": "section_description",
- "selector": ".charge-content",
- "type": "text",
- },
- {
- "name": "course_name",
- "selector": ".text-block-93",
- "type": "text",
- },
- {
- "name": "course_description",
- "selector": ".course-content-text",
- "type": "text",
- },
- {
- "name": "course_icon",
- "selector": ".image-92",
- "type": "attribute",
- "attribute": "src"
- }
- ]
-}
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .w-tab-content > div",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src",
+ },
+ ],
+ }
- async with AsyncWebCrawler(
- headless=True,
- verbose=True
- ) as crawler:
-
+ async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
# Create the JavaScript that handles clicking multiple times
js_click_tabs = """
(async () => {
@@ -198,19 +208,20 @@ async def extract_structured_data_using_css_extractor():
await new Promise(r => setTimeout(r, 500));
}
})();
- """
+ """
result = await crawler.arun(
url="https://www.kidocode.com/degrees/technology",
extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
js_code=[js_click_tabs],
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS,
)
companies = json.loads(result.extracted_content)
print(f"Successfully extracted {len(companies)} companies")
print(json.dumps(companies[0], indent=2))
+
# Advanced Session-Based Crawling with Dynamic Content π
async def crawl_dynamic_content_pages_method_1():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -267,6 +278,7 @@ async def crawl_dynamic_content_pages_method_1():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
async def crawl_dynamic_content_pages_method_2():
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -334,8 +346,11 @@ async def crawl_dynamic_content_pages_method_2():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
async def crawl_dynamic_content_pages_method_3():
- print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")
+ print(
+ "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
+ )
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://github.com/microsoft/TypeScript/commits/main"
@@ -357,7 +372,7 @@ async def crawl_dynamic_content_pages_method_3():
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""
-
+
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
@@ -395,40 +410,53 @@ async def crawl_dynamic_content_pages_method_3():
await crawler.crawler_strategy.kill_session(session_id)
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
async def crawl_custom_browser_type():
# Use Firefox
start = time.time()
- async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ async with AsyncWebCrawler(
+ browser_type="firefox", verbose=True, headless=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com", cache_mode=CacheMode.BYPASS
+ )
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use WebKit
start = time.time()
- async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ async with AsyncWebCrawler(
+ browser_type="webkit", verbose=True, headless=True
+ ) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com", cache_mode=CacheMode.BYPASS
+ )
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use Chromium (default)
start = time.time()
- async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+ async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com", cache_mode=CacheMode.BYPASS
+ )
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
+
async def crawl_with_user_simultion():
async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
url = "YOUR-URL-HERE"
result = await crawler.arun(
- url=url,
+ url=url,
cache_mode=CacheMode.BYPASS,
- magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
+ magic=True, # Automatically detects and removes overlays, popups, and other elements that block content
# simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
# override_navigator = True # Overrides the navigator object to make it look like a real user
)
-
- print(result.markdown)
+
+ print(result.markdown)
+
async def speed_comparison():
# print("\n--- Speed Comparison ---")
@@ -439,18 +467,18 @@ async def speed_comparison():
# print()
# Simulated Firecrawl performance
from firecrawl import FirecrawlApp
- app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+
+ app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
- 'https://www.nbcnews.com/business',
- params={'formats': ['markdown', 'html']}
+ "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(scrape_status['markdown'])} characters")
print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
- print()
+ print()
async with AsyncWebCrawler() as crawler:
# Crawl4AI simple crawl
@@ -474,7 +502,9 @@ async def speed_comparison():
url="https://www.nbcnews.com/business",
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
- content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ )
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
cache_mode=CacheMode.BYPASS,
@@ -498,7 +528,9 @@ async def speed_comparison():
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
- content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ )
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
verbose=False,
@@ -520,11 +552,12 @@ async def speed_comparison():
print("If you run these tests in an environment with better network conditions,")
print("you may observe an even more significant speed advantage for Crawl4AI.")
+
async def generate_knowledge_graph():
class Entity(BaseModel):
name: str
description: str
-
+
class Relationship(BaseModel):
entity1: Entity
entity2: Entity
@@ -536,11 +569,11 @@ async def generate_knowledge_graph():
relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy(
- provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models
- api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token"
- schema=KnowledgeGraph.model_json_schema(),
- extraction_type="schema",
- instruction="""Extract entities and relationships from the given text."""
+ provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models
+ api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token"
+ schema=KnowledgeGraph.model_json_schema(),
+ extraction_type="schema",
+ instruction="""Extract entities and relationships from the given text.""",
)
async with AsyncWebCrawler() as crawler:
url = "https://paulgraham.com/love.html"
@@ -554,27 +587,22 @@ async def generate_knowledge_graph():
with open(os.path.join(__location__, "kb.json"), "w") as f:
f.write(result.extracted_content)
+
async def fit_markdown_remove_overlay():
-
async with AsyncWebCrawler(
- headless=True, # Set to False to see what is happening
- verbose=True,
- user_agent_mode="random",
- user_agent_generator_config={
- "device_type": "mobile",
- "os_type": "android"
- },
+ headless=True, # Set to False to see what is happening
+ verbose=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
) as crawler:
result = await crawler.arun(
- url='https://www.kidocode.com/degrees/technology',
+ url="https://www.kidocode.com/degrees/technology",
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
),
- options={
- "ignore_links": True
- }
+ options={"ignore_links": True},
),
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
@@ -583,31 +611,38 @@ async def fit_markdown_remove_overlay():
# }
# ),
)
-
+
if result.success:
print(len(result.markdown_v2.raw_markdown))
print(len(result.markdown_v2.markdown_with_citations))
print(len(result.markdown_v2.fit_markdown))
-
+
# Save clean html
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
f.write(result.cleaned_html)
-
- with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+
+ with open(
+ os.path.join(__location__, "output/output_raw_markdown.md"), "w"
+ ) as f:
f.write(result.markdown_v2.raw_markdown)
-
- with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
- f.write(result.markdown_v2.markdown_with_citations)
-
- with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:
+
+ with open(
+ os.path.join(__location__, "output/output_markdown_with_citations.md"),
+ "w",
+ ) as f:
+ f.write(result.markdown_v2.markdown_with_citations)
+
+ with open(
+ os.path.join(__location__, "output/output_fit_markdown.md"), "w"
+ ) as f:
f.write(result.markdown_v2.fit_markdown)
-
+
print("Done")
async def main():
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-
+
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
@@ -618,7 +653,7 @@ async def main():
# LLM extraction examples
# await extract_structured_data_using_llm()
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
- # await extract_structured_data_using_llm("ollama/llama3.2")
+ # await extract_structured_data_using_llm("ollama/llama3.2")
# You always can pass custom headers to the extraction strategy
# custom_headers = {
@@ -626,13 +661,13 @@ async def main():
# "X-Custom-Header": "Some-Value"
# }
# await extract_structured_data_using_llm(extra_headers=custom_headers)
-
+
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()
-
+
# await crawl_custom_browser_type()
-
+
# await speed_comparison()
diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py
index 89c63139..0248af29 100644
--- a/docs/examples/quickstart_sync.py
+++ b/docs/examples/quickstart_sync.py
@@ -10,15 +10,17 @@ from functools import lru_cache
console = Console()
+
@lru_cache()
def create_crawler():
crawler = WebCrawler(verbose=True)
crawler.warmup()
return crawler
+
def print_result(result):
# Print each key in one line and just the first 10 characters of each one's value and three dots
- console.print(f"\t[bold]Result:[/bold]")
+ console.print("\t[bold]Result:[/bold]")
for key, value in result.model_dump().items():
if isinstance(value, str) and value:
console.print(f"\t{key}: [green]{value[:20]}...[/green]")
@@ -33,18 +35,27 @@ def cprint(message, press_any_key=False):
console.print("Press any key to continue...", style="")
input()
+
def basic_usage(crawler):
- cprint("π οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
- result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
+ cprint(
+ "π οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
+ )
+ result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
cprint("[LOG] π¦ [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)
+
def basic_usage_some_params(crawler):
- cprint("π οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
- result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
+ cprint(
+ "π οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
+ )
+ result = crawler.run(
+ url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
+ )
cprint("[LOG] π¦ [bold yellow]Basic crawl result:[/bold yellow]")
print_result(result)
+
def screenshot_usage(crawler):
cprint("\nπΈ [bold cyan]Let's take a screenshot of the page![/bold cyan]")
result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
@@ -55,16 +66,23 @@ def screenshot_usage(crawler):
cprint("Screenshot saved to 'screenshot.png'!")
print_result(result)
+
def understanding_parameters(crawler):
- cprint("\nπ§ [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
- cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
-
+ cprint(
+ "\nπ§ [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
+ )
+ cprint(
+ "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
+ )
+
# First crawl (reads from cache)
cprint("1οΈβ£ First crawl (caches the result):", True)
start_time = time.time()
result = crawler.run(url="https://www.nbcnews.com/business")
end_time = time.time()
- cprint(f"[LOG] π¦ [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]")
+ cprint(
+ f"[LOG] π¦ [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
+ )
print_result(result)
# Force to crawl again
@@ -72,169 +90,232 @@ def understanding_parameters(crawler):
start_time = time.time()
result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
end_time = time.time()
- cprint(f"[LOG] π¦ [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
+ cprint(
+ f"[LOG] π¦ [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
+ )
print_result(result)
+
def add_chunking_strategy(crawler):
# Adding a chunking strategy: RegexChunking
- cprint("\nπ§© [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
- cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
+ cprint(
+ "\nπ§© [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
+ True,
+ )
+ cprint(
+ "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
+ )
result = crawler.run(
url="https://www.nbcnews.com/business",
- chunking_strategy=RegexChunking(patterns=["\n\n"])
+ chunking_strategy=RegexChunking(patterns=["\n\n"]),
)
cprint("[LOG] π¦ [bold yellow]RegexChunking result:[/bold yellow]")
print_result(result)
# Adding another chunking strategy: NlpSentenceChunking
- cprint("\nπ [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
- cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
+ cprint(
+ "\nπ [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
+ True,
+ )
+ cprint(
+ "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
+ )
result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=NlpSentenceChunking()
+ url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
)
cprint("[LOG] π¦ [bold yellow]NlpSentenceChunking result:[/bold yellow]")
print_result(result)
+
def add_extraction_strategy(crawler):
# Adding an extraction strategy: CosineStrategy
- cprint("\nπ§ [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
- cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
+ cprint(
+ "\nπ§ [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
+ True,
+ )
+ cprint(
+ "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
+ )
result = crawler.run(
url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
+ extraction_strategy=CosineStrategy(
+ word_count_threshold=10,
+ max_dist=0.2,
+ linkage_method="ward",
+ top_k=3,
+ sim_threshold=0.3,
+ verbose=True,
+ ),
)
cprint("[LOG] π¦ [bold yellow]CosineStrategy result:[/bold yellow]")
print_result(result)
-
+
# Using semantic_filter with CosineStrategy
- cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
+ cprint(
+ "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
+ )
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy(
semantic_filter="inflation rent prices",
- )
+ ),
+ )
+ cprint(
+ "[LOG] π¦ [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
print_result(result)
+
def add_llm_extraction_strategy(crawler):
# Adding an LLM extraction strategy without instructions
- cprint("\nπ€ [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
- cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
+ cprint(
+ "\nπ€ [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
+ True,
+ )
+ cprint(
+ "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
+ )
result = crawler.run(
url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
+ extraction_strategy=LLMExtractionStrategy(
+ provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+ ),
+ )
+ cprint(
+ "[LOG] π¦ [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
print_result(result)
-
+
# Adding an LLM extraction strategy with instructions
- cprint("\nπ [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
- cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
+ cprint(
+ "\nπ [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
+ True,
+ )
+ cprint(
+ "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
+ )
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
- instruction="I am interested in only financial news"
- )
+ api_token=os.getenv("OPENAI_API_KEY"),
+ instruction="I am interested in only financial news",
+ ),
+ )
+ cprint(
+ "[LOG] π¦ [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
print_result(result)
-
+
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
- instruction="Extract only content related to technology"
- )
+ api_token=os.getenv("OPENAI_API_KEY"),
+ instruction="Extract only content related to technology",
+ ),
+ )
+ cprint(
+ "[LOG] π¦ [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
print_result(result)
+
def targeted_extraction(crawler):
# Using a CSS selector to extract only H2 tags
- cprint("\nπ― [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- css_selector="h2"
+ cprint(
+ "\nπ― [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
+ True,
)
+ result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
cprint("[LOG] π¦ [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
print_result(result)
+
def interactive_extraction(crawler):
# Passing JavaScript code to interact with the page
- cprint("\nπ±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
- cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+ cprint(
+ "\nπ±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
+ True,
+ )
+ cprint(
+ "In this example we try to click the 'Load More' button on the page using JavaScript code."
+ )
js_code = """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
"""
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- js = js_code
+ result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
+ cprint(
+ "[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
+
def multiple_scrip(crawler):
# Passing JavaScript code to interact with the page
- cprint("\nπ±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
- cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
- js_code = ["""
+ cprint(
+ "\nπ±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
+ True,
+ )
+ cprint(
+ "In this example we try to click the 'Load More' button on the page using JavaScript code."
+ )
+ js_code = [
+ """
const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
- """] * 2
+ """
+ ] * 2
# crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
# crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- js = js_code
+ result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
+ cprint(
+ "[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
)
- cprint("[LOG] π¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
print_result(result)
+
def using_crawler_hooks(crawler):
# Example usage of the hooks for authentication and setting a cookie
def on_driver_created(driver):
print("[HOOK] on_driver_created")
# Example customization: maximize the window
driver.maximize_window()
-
+
# Example customization: logging in to a hypothetical website
- driver.get('https://example.com/login')
-
+ driver.get("https://example.com/login")
+
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
-
+
WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.NAME, 'username'))
+ EC.presence_of_element_located((By.NAME, "username"))
)
- driver.find_element(By.NAME, 'username').send_keys('testuser')
- driver.find_element(By.NAME, 'password').send_keys('password123')
- driver.find_element(By.NAME, 'login').click()
+ driver.find_element(By.NAME, "username").send_keys("testuser")
+ driver.find_element(By.NAME, "password").send_keys("password123")
+ driver.find_element(By.NAME, "login").click()
WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.ID, 'welcome'))
+ EC.presence_of_element_located((By.ID, "welcome"))
)
# Add a custom cookie
- driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
- return driver
-
+ driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
+ return driver
def before_get_url(driver):
print("[HOOK] before_get_url")
# Example customization: add a custom header
# Enable Network domain for sending headers
- driver.execute_cdp_cmd('Network.enable', {})
+ driver.execute_cdp_cmd("Network.enable", {})
# Add a custom header
- driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+ driver.execute_cdp_cmd(
+ "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
+ )
return driver
-
+
def after_get_url(driver):
print("[HOOK] after_get_url")
# Example customization: log the URL
@@ -246,48 +327,59 @@ def using_crawler_hooks(crawler):
# Example customization: log the HTML
print(len(html))
return driver
-
- cprint("\nπ [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
-
+
+ cprint(
+ "\nπ [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
+ True,
+ )
+
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook('on_driver_created', on_driver_created)
- crawler_strategy.set_hook('before_get_url', before_get_url)
- crawler_strategy.set_hook('after_get_url', after_get_url)
- crawler_strategy.set_hook('before_return_html', before_return_html)
-
+ crawler_strategy.set_hook("on_driver_created", on_driver_created)
+ crawler_strategy.set_hook("before_get_url", before_get_url)
+ crawler_strategy.set_hook("after_get_url", after_get_url)
+ crawler_strategy.set_hook("before_return_html", before_return_html)
+
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
- crawler.warmup()
+ crawler.warmup()
result = crawler.run(url="https://example.com")
-
+
cprint("[LOG] π¦ [bold yellow]Crawler Hooks result:[/bold yellow]")
- print_result(result= result)
-
+ print_result(result=result)
+
+
def using_crawler_hooks_dleay_example(crawler):
def delay(driver):
print("Delaying for 5 seconds...")
time.sleep(5)
print("Resuming...")
-
+
def create_crawler():
crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook('after_get_url', delay)
+ crawler_strategy.set_hook("after_get_url", delay)
crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
crawler.warmup()
return crawler
- cprint("\nπ [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
+ cprint(
+ "\nπ [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
+ )
crawler = create_crawler()
- result = crawler.run(url="https://google.com", bypass_cache=True)
-
+ result = crawler.run(url="https://google.com", bypass_cache=True)
+
cprint("[LOG] π¦ [bold yellow]Crawler Hooks result:[/bold yellow]")
print_result(result)
-
-
+
def main():
- cprint("π [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! π[/bold green]")
- cprint("β³οΈ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
- cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
+ cprint(
+ "π [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! π[/bold green]"
+ )
+ cprint(
+ "β³οΈ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
+ )
+ cprint(
+ "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
+ )
crawler = create_crawler()
@@ -295,7 +387,7 @@ def main():
basic_usage(crawler)
# basic_usage_some_params(crawler)
understanding_parameters(crawler)
-
+
crawler.always_by_pass_cache = True
screenshot_usage(crawler)
add_chunking_strategy(crawler)
@@ -305,8 +397,10 @@ def main():
interactive_extraction(crawler)
multiple_scrip(crawler)
- cprint("\nπ [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! πΈοΈ[/bold green]")
+ cprint(
+ "\nπ [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! πΈοΈ[/bold green]"
+ )
+
if __name__ == "__main__":
main()
-
diff --git a/docs/examples/research_assistant.py b/docs/examples/research_assistant.py
index de35ce84..84ba3c76 100644
--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -11,7 +11,9 @@ from groq import Groq
# Import threadpools to run the crawl_url function in a separate thread
from concurrent.futures import ThreadPoolExecutor
-client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+client = AsyncOpenAI(
+ base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")
+)
# Instrument the OpenAI client
cl.instrument_openai()
@@ -25,41 +27,39 @@ settings = {
"presence_penalty": 0,
}
+
def extract_urls(text):
- url_pattern = re.compile(r'(https?://\S+)')
+ url_pattern = re.compile(r"(https?://\S+)")
return url_pattern.findall(text)
+
def crawl_url(url):
data = {
"urls": [url],
"include_raw_html": True,
"word_count_threshold": 10,
"extraction_strategy": "NoExtractionStrategy",
- "chunking_strategy": "RegexChunking"
+ "chunking_strategy": "RegexChunking",
}
response = requests.post("https://crawl4ai.com/crawl", json=data)
response_data = response.json()
- response_data = response_data['results'][0]
- return response_data['markdown']
+ response_data = response_data["results"][0]
+ return response_data["markdown"]
+
@cl.on_chat_start
async def on_chat_start():
- cl.user_session.set("session", {
- "history": [],
- "context": {}
- })
- await cl.Message(
- content="Welcome to the chat! How can I assist you today?"
- ).send()
+ cl.user_session.set("session", {"history": [], "context": {}})
+ await cl.Message(content="Welcome to the chat! How can I assist you today?").send()
+
@cl.on_message
async def on_message(message: cl.Message):
user_session = cl.user_session.get("session")
-
+
# Extract URLs from the user's message
urls = extract_urls(message.content)
-
-
+
futures = []
with ThreadPoolExecutor() as executor:
for url in urls:
@@ -69,16 +69,9 @@ async def on_message(message: cl.Message):
for url, result in zip(urls, results):
ref_number = f"REF_{len(user_session['context']) + 1}"
- user_session["context"][ref_number] = {
- "url": url,
- "content": result
- }
+ user_session["context"][ref_number] = {"url": url, "content": result}
-
- user_session["history"].append({
- "role": "user",
- "content": message.content
- })
+ user_session["history"].append({"role": "user", "content": message.content})
# Create a system message that includes the context
context_messages = [
@@ -95,26 +88,17 @@ async def on_message(message: cl.Message):
"If not, there is no need to add a references section. "
"At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
"\n\n".join(context_messages)
- )
+ ),
}
else:
- system_message = {
- "role": "system",
- "content": "You are a helpful assistant."
- }
-
+ system_message = {"role": "system", "content": "You are a helpful assistant."}
msg = cl.Message(content="")
await msg.send()
# Get response from the LLM
stream = await client.chat.completions.create(
- messages=[
- system_message,
- *user_session["history"]
- ],
- stream=True,
- **settings
+ messages=[system_message, *user_session["history"]], stream=True, **settings
)
assistant_response = ""
@@ -124,10 +108,7 @@ async def on_message(message: cl.Message):
await msg.stream_token(token)
# Add assistant message to the history
- user_session["history"].append({
- "role": "assistant",
- "content": assistant_response
- })
+ user_session["history"].append({"role": "assistant", "content": assistant_response})
await msg.update()
# Append the reference section to the assistant's response
@@ -154,10 +135,11 @@ async def on_audio_chunk(chunk: cl.AudioChunk):
pass
+
@cl.step(type="tool")
async def speech_to_text(audio_file):
cli = Groq()
-
+
response = await client.audio.transcriptions.create(
model="whisper-large-v3", file=audio_file
)
@@ -172,24 +154,19 @@ async def on_audio_end(elements: list[ElementBased]):
audio_buffer.seek(0) # Move the file pointer to the beginning
audio_file = audio_buffer.read()
audio_mime_type: str = cl.user_session.get("audio_mime_type")
-
+
start_time = time.time()
whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
transcription = await speech_to_text(whisper_input)
end_time = time.time()
print(f"Transcription took {end_time - start_time} seconds")
-
- user_msg = cl.Message(
- author="You",
- type="user_message",
- content=transcription
- )
+
+ user_msg = cl.Message(author="You", type="user_message", content=transcription)
await user_msg.send()
await on_message(user_msg)
if __name__ == "__main__":
from chainlit.cli import run_chainlit
+
run_chainlit(__file__)
-
-
diff --git a/docs/examples/rest_call.py b/docs/examples/rest_call.py
index 465c6114..47c09435 100644
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,4 +1,3 @@
-
import requests, base64, os
data = {
@@ -6,59 +5,50 @@ data = {
"screenshot": True,
}
-response = requests.post("https://crawl4ai.com/crawl", json=data)
-result = response.json()['results'][0]
+response = requests.post("https://crawl4ai.com/crawl", json=data)
+result = response.json()["results"][0]
print(result.keys())
-# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
-# 'links', 'screenshot', 'markdown', 'extracted_content',
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
+# 'links', 'screenshot', 'markdown', 'extracted_content',
# 'metadata', 'error_message'])
with open("screenshot.png", "wb") as f:
- f.write(base64.b64decode(result['screenshot']))
-
+ f.write(base64.b64decode(result["screenshot"]))
+
# Example of filtering the content using CSS selectors
data = {
- "urls": [
- "https://www.nbcnews.com/business"
- ],
+ "urls": ["https://www.nbcnews.com/business"],
"css_selector": "article",
"screenshot": True,
}
# Example of executing a JS script on the page before extracting the content
data = {
- "urls": [
- "https://www.nbcnews.com/business"
- ],
+ "urls": ["https://www.nbcnews.com/business"],
"screenshot": True,
- 'js' : ["""
+ "js": [
+ """
const loadMoreButton = Array.from(document.querySelectorAll('button')).
find(button => button.textContent.includes('Load More'));
loadMoreButton && loadMoreButton.click();
- """]
+ """
+ ],
}
# Example of using a custom extraction strategy
data = {
- "urls": [
- "https://www.nbcnews.com/business"
- ],
+ "urls": ["https://www.nbcnews.com/business"],
"extraction_strategy": "CosineStrategy",
- "extraction_strategy_args": {
- "semantic_filter": "inflation rent prices"
- },
+ "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
}
# Example of using LLM to extract content
data = {
- "urls": [
- "https://www.nbcnews.com/business"
- ],
+ "urls": ["https://www.nbcnews.com/business"],
"extraction_strategy": "LLMExtractionStrategy",
"extraction_strategy_args": {
"provider": "groq/llama3-8b-8192",
"api_token": os.environ.get("GROQ_API_KEY"),
"instruction": """I am interested in only financial news,
- and translate them in French."""
+ and translate them in French.""",
},
}
-
diff --git a/docs/examples/ssl_example.py b/docs/examples/ssl_example.py
index 410e9485..7379862c 100644
--- a/docs/examples/ssl_example.py
+++ b/docs/examples/ssl_example.py
@@ -5,42 +5,47 @@ import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Create tmp directory if it doesn't exist
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
tmp_dir = os.path.join(parent_dir, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
+
async def main():
# Configure crawler to fetch SSL certificate
config = CrawlerRunConfig(
fetch_ssl_certificate=True,
- cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates
+ cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
)
async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url='https://example.com',
- config=config
- )
-
+ result = await crawler.arun(url="https://example.com", config=config)
+
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
-
+
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
print(f"Valid until: {cert.valid_until}")
print(f"Fingerprint: {cert.fingerprint}")
-
+
# 2. Export certificate in different formats
cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
print("\nCertificate exported to:")
print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
-
- pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers
+
+ pem_data = cert.to_pem(
+ os.path.join(tmp_dir, "certificate.pem")
+ ) # For web servers
print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
-
- der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps
+
+ der_data = cert.to_der(
+ os.path.join(tmp_dir, "certificate.der")
+ ) # For Java apps
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
if __name__ == "__main__":
asyncio.run(main())
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
index 85158999..da2bcd21 100644
--- a/docs/examples/summarize_page.py
+++ b/docs/examples/summarize_page.py
@@ -1,39 +1,41 @@
import os
-import time
import json
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
-url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
+
class PageSummary(BaseModel):
title: str = Field(..., description="Title of the page.")
summary: str = Field(..., description="Summary of the page.")
brief_summary: str = Field(..., description="Brief summary of the page.")
keywords: list = Field(..., description="Keywords assigned to the page.")
+
result = crawler.run(
url=url,
word_count_threshold=1,
- extraction_strategy= LLMExtractionStrategy(
- provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
+ extraction_strategy=LLMExtractionStrategy(
+ provider="openai/gpt-4o",
+ api_token=os.getenv("OPENAI_API_KEY"),
schema=PageSummary.model_json_schema(),
extraction_type="schema",
- apply_chunking =False,
- instruction="From the crawled content, extract the following details: "\
- "1. Title of the page "\
- "2. Summary of the page, which is a detailed summary "\
- "3. Brief summary of the page, which is a paragraph text "\
- "4. Keywords assigned to the page, which is a list of keywords. "\
- 'The extracted JSON format should look like this: '\
- '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+ apply_chunking=False,
+ instruction="From the crawled content, extract the following details: "
+ "1. Title of the page "
+ "2. Summary of the page, which is a detailed summary "
+ "3. Brief summary of the page, which is a paragraph text "
+ "4. Keywords assigned to the page, which is a list of keywords. "
+ "The extracted JSON format should look like this: "
+ '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
),
bypass_cache=True,
)
diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py
index 362ae8fc..4938db7b 100644
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,4 +1,5 @@
import os, sys
+
# append the parent directory to the sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
@@ -13,19 +14,18 @@ import json
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
+
# 1. File Download Processing Example
async def download_example():
"""Example of downloading files from Python.org"""
# downloads_path = os.path.join(os.getcwd(), "downloads")
downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
os.makedirs(downloads_path, exist_ok=True)
-
+
print(f"Downloads will be saved to: {downloads_path}")
-
+
async with AsyncWebCrawler(
- accept_downloads=True,
- downloads_path=downloads_path,
- verbose=True
+ accept_downloads=True, downloads_path=downloads_path, verbose=True
) as crawler:
result = await crawler.arun(
url="https://www.python.org/downloads/",
@@ -40,9 +40,9 @@ async def download_example():
}
""",
delay_before_return_html=1, # Wait 5 seconds to ensure download starts
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS,
)
-
+
if result.downloaded_files:
print("\nDownload successful!")
print("Downloaded files:")
@@ -52,25 +52,26 @@ async def download_example():
else:
print("\nNo files were downloaded")
+
# 2. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
"""Example of processing local files and raw HTML"""
# Create a sample HTML file
sample_file = os.path.join(__data__, "sample.html")
with open(sample_file, "w") as f:
- f.write("""
+ f.write(
+ """
Test Content
This is a test paragraph.
- """)
-
+ """
+ )
+
async with AsyncWebCrawler(verbose=True) as crawler:
# Process local file
- local_result = await crawler.arun(
- url=f"file://{os.path.abspath(sample_file)}"
- )
-
+ local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
+
# Process raw HTML
raw_html = """
@@ -78,16 +79,15 @@ async def local_and_raw_html_example():
This is a test of raw HTML processing.
"""
- raw_result = await crawler.arun(
- url=f"raw:{raw_html}"
- )
-
+ raw_result = await crawler.arun(url=f"raw:{raw_html}")
+
# Clean up
os.remove(sample_file)
-
+
print("Local file content:", local_result.markdown)
print("\nRaw HTML content:", raw_result.markdown)
+
# 3. Enhanced Markdown Generation Example
async def markdown_generation_example():
"""Example of enhanced markdown generation with citations and LLM-friendly features"""
@@ -97,58 +97,66 @@ async def markdown_generation_example():
# user_query="History and cultivation",
bm25_threshold=1.0
)
-
+
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
css_selector="main div#bodyContent",
content_filter=content_filter,
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS,
)
-
- from crawl4ai import AsyncWebCrawler
+
from crawl4ai.content_filter_strategy import BM25ContentFilter
-
+
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Apple",
css_selector="main div#bodyContent",
- content_filter=BM25ContentFilter()
+ content_filter=BM25ContentFilter(),
)
print(result.markdown_v2.fit_markdown)
-
+
print("\nMarkdown Generation Results:")
print(f"1. Original markdown length: {len(result.markdown)}")
- print(f"2. New markdown versions (markdown_v2):")
+ print("2. New markdown versions (markdown_v2):")
print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
- print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
- print(f" - References section length: {len(result.markdown_v2.references_markdown)}")
+ print(
+ f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
+ )
+ print(
+ f" - References section length: {len(result.markdown_v2.references_markdown)}"
+ )
if result.markdown_v2.fit_markdown:
- print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
-
+ print(
+ f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
+ )
+
# Save examples to files
output_dir = os.path.join(__data__, "markdown_examples")
os.makedirs(output_dir, exist_ok=True)
-
+
# Save different versions
with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
f.write(result.markdown_v2.raw_markdown)
-
+
with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
f.write(result.markdown_v2.markdown_with_citations)
-
+
with open(os.path.join(output_dir, "3_references.md"), "w") as f:
f.write(result.markdown_v2.references_markdown)
-
+
if result.markdown_v2.fit_markdown:
with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
f.write(result.markdown_v2.fit_markdown)
-
+
print(f"\nMarkdown examples saved to: {output_dir}")
-
+
# Show a sample of citations and references
print("\nSample of markdown with citations:")
print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
print("Sample of references:")
- print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
+ print(
+ "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
+ )
+
# 4. Browser Management Example
async def browser_management_example():
@@ -156,38 +164,38 @@ async def browser_management_example():
# Use the specified user directory path
user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
os.makedirs(user_data_dir, exist_ok=True)
-
+
print(f"Browser profile will be saved to: {user_data_dir}")
-
+
async with AsyncWebCrawler(
use_managed_browser=True,
user_data_dir=user_data_dir,
headless=False,
- verbose=True
+ verbose=True,
) as crawler:
-
result = await crawler.arun(
url="https://crawl4ai.com",
# session_id="persistent_session_1",
- cache_mode=CacheMode.BYPASS
- )
+ cache_mode=CacheMode.BYPASS,
+ )
# Use GitHub as an example - it's a good test for browser management
# because it requires proper browser handling
result = await crawler.arun(
url="https://github.com/trending",
# session_id="persistent_session_1",
- cache_mode=CacheMode.BYPASS
+ cache_mode=CacheMode.BYPASS,
)
-
+
print("\nBrowser session result:", result.success)
if result.success:
- print("Page title:", result.metadata.get('title', 'No title found'))
+ print("Page title:", result.metadata.get("title", "No title found"))
+
# 5. API Usage Example
async def api_example():
"""Example of using the new API endpoints"""
- api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
- headers = {'Authorization': f'Bearer {api_token}'}
+ api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+ headers = {"Authorization": f"Bearer {api_token}"}
async with aiohttp.ClientSession() as session:
# Submit crawl job
crawl_request = {
@@ -199,25 +207,17 @@ async def api_example():
"name": "Hacker News Articles",
"baseSelector": ".athing",
"fields": [
- {
- "name": "title",
- "selector": ".title a",
- "type": "text"
- },
- {
- "name": "score",
- "selector": ".score",
- "type": "text"
- },
+ {"name": "title", "selector": ".title a", "type": "text"},
+ {"name": "score", "selector": ".score", "type": "text"},
{
"name": "url",
"selector": ".title a",
"type": "attribute",
- "attribute": "href"
- }
- ]
+ "attribute": "href",
+ },
+ ],
}
- }
+ },
},
"crawler_params": {
"headless": True,
@@ -227,51 +227,50 @@ async def api_example():
# "screenshot": True,
# "magic": True
}
-
+
async with session.post(
- "http://localhost:11235/crawl",
- json=crawl_request,
- headers=headers
+ "http://localhost:11235/crawl", json=crawl_request, headers=headers
) as response:
task_data = await response.json()
task_id = task_data["task_id"]
-
+
# Check task status
while True:
async with session.get(
- f"http://localhost:11235/task/{task_id}",
- headers=headers
+ f"http://localhost:11235/task/{task_id}", headers=headers
) as status_response:
result = await status_response.json()
print(f"Task status: {result['status']}")
-
+
if result["status"] == "completed":
print("Task completed!")
print("Results:")
- news = json.loads(result["results"][0]['extracted_content'])
+ news = json.loads(result["results"][0]["extracted_content"])
print(json.dumps(news[:4], indent=2))
break
else:
await asyncio.sleep(1)
+
# Main execution
async def main():
# print("Running Crawl4AI feature examples...")
-
+
# print("\n1. Running Download Example:")
# await download_example()
-
+
# print("\n2. Running Markdown Generation Example:")
# await markdown_generation_example()
-
+
# # print("\n3. Running Local and Raw HTML Example:")
# await local_and_raw_html_example()
-
+
# # print("\n4. Running Browser Management Example:")
await browser_management_example()
-
+
# print("\n5. Running API Example:")
await api_example()
+
if __name__ == "__main__":
- asyncio.run(main())
\ No newline at end of file
+ asyncio.run(main())
diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py
index 135ac29c..996e7b04 100644
--- a/docs/examples/v0_4_24_walkthrough.py
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -10,18 +10,17 @@ import asyncio
import os
import json
import re
-from typing import List, Optional, Dict, Any
-from pydantic import BaseModel, Field
+from typing import List
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CacheMode,
LLMExtractionStrategy,
- JsonCssExtractionStrategy
+ JsonCssExtractionStrategy,
)
from crawl4ai.content_filter_strategy import RelevantContentFilter
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from bs4 import BeautifulSoup
# Sample HTML for demonstrations
@@ -52,17 +51,18 @@ SAMPLE_HTML = """
"""
+
async def demo_ssl_features():
"""
Enhanced SSL & Security Features Demo
-----------------------------------
-
+
This example demonstrates the new SSL certificate handling and security features:
1. Custom certificate paths
2. SSL verification options
3. HTTPS error handling
4. Certificate validation configurations
-
+
These features are particularly useful when:
- Working with self-signed certificates
- Dealing with corporate proxies
@@ -76,14 +76,11 @@ async def demo_ssl_features():
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
- fetch_ssl_certificate=True # Enable SSL certificate fetching
+ fetch_ssl_certificate=True, # Enable SSL certificate fetching
)
async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(
- url="https://example.com",
- config=run_config
- )
+ result = await crawler.arun(url="https://example.com", config=run_config)
print(f"SSL Crawl Success: {result.success}")
result.ssl_certificate.to_json(
os.path.join(os.getcwd(), "ssl_certificate.json")
@@ -91,11 +88,12 @@ async def demo_ssl_features():
if not result.success:
print(f"SSL Error: {result.error_message}")
+
async def demo_content_filtering():
"""
Smart Content Filtering Demo
----------------------
-
+
Demonstrates advanced content filtering capabilities:
1. Custom filter to identify and extract specific content
2. Integration with markdown generation
@@ -110,87 +108,90 @@ async def demo_content_filtering():
super().__init__()
# Add news-specific patterns
self.negative_patterns = re.compile(
- r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
- re.I
+ r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
+ re.I,
)
self.min_word_count = 30 # Higher threshold for news content
- def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+ def filter_content(
+ self, html: str, min_word_threshold: int = None
+ ) -> List[str]:
"""
Implements news-specific content filtering logic.
-
+
Args:
html (str): HTML content to be filtered
min_word_threshold (int, optional): Minimum word count threshold
-
+
Returns:
List[str]: List of filtered HTML content blocks
"""
if not html or not isinstance(html, str):
return []
-
- soup = BeautifulSoup(html, 'lxml')
+
+ soup = BeautifulSoup(html, "lxml")
if not soup.body:
- soup = BeautifulSoup(f'{html}', 'lxml')
-
- body = soup.find('body')
-
+ soup = BeautifulSoup(f"{html}", "lxml")
+
+ body = soup.find("body")
+
# Extract chunks with metadata
- chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)
-
+ chunks = self.extract_text_chunks(
+ body, min_word_threshold or self.min_word_count
+ )
+
# Filter chunks based on news-specific criteria
filtered_chunks = []
for _, text, tag_type, element in chunks:
# Skip if element has negative class/id
if self.is_excluded(element):
continue
-
+
# Headers are important in news articles
- if tag_type == 'header':
+ if tag_type == "header":
filtered_chunks.append(self.clean_element(element))
continue
-
+
# For content, check word count and link density
text = element.get_text(strip=True)
if len(text.split()) >= (min_word_threshold or self.min_word_count):
# Calculate link density
- links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
+ links_text = " ".join(
+ a.get_text(strip=True) for a in element.find_all("a")
+ )
link_density = len(links_text) / len(text) if text else 1
-
+
# Accept if link density is reasonable
if link_density < 0.5:
filtered_chunks.append(self.clean_element(element))
-
+
return filtered_chunks
# Create markdown generator with custom filter
- markdown_gen = DefaultMarkdownGenerator(
- content_filter=CustomNewsFilter()
- )
+ markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
run_config = CrawlerRunConfig(
- markdown_generator=markdown_gen,
- cache_mode=CacheMode.BYPASS
+ markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
- url="https://news.ycombinator.com",
- config=run_config
+ url="https://news.ycombinator.com", config=run_config
)
print("Filtered Content Sample:")
print(result.markdown[:500]) # Show first 500 chars
+
async def demo_json_extraction():
"""
Improved JSON Extraction Demo
---------------------------
-
+
Demonstrates the enhanced JSON extraction capabilities:
1. Base element attributes extraction
2. Complex nested structures
3. Multiple extraction patterns
-
+
Key features shown:
- Extracting attributes from base elements (href, data-* attributes)
- Processing repeated patterns
@@ -206,7 +207,7 @@ async def demo_json_extraction():
"baseSelector": "div.article-list",
"baseFields": [
{"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
- {"name": "category", "type": "attribute", "attribute": "data-category"}
+ {"name": "category", "type": "attribute", "attribute": "data-category"},
],
"fields": [
{
@@ -214,8 +215,16 @@ async def demo_json_extraction():
"selector": "article.post",
"type": "nested_list",
"baseFields": [
- {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
- {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+ {
+ "name": "post_id",
+ "type": "attribute",
+ "attribute": "data-post-id",
+ },
+ {
+ "name": "author_id",
+ "type": "attribute",
+ "attribute": "data-author",
+ },
],
"fields": [
{
@@ -223,60 +232,68 @@ async def demo_json_extraction():
"selector": "h2.title a",
"type": "text",
"baseFields": [
- {"name": "url", "type": "attribute", "attribute": "href"}
- ]
+ {
+ "name": "url",
+ "type": "attribute",
+ "attribute": "href",
+ }
+ ],
},
{
"name": "author",
"selector": "div.meta a.author",
"type": "text",
"baseFields": [
- {"name": "profile_url", "type": "attribute", "attribute": "href"}
- ]
- },
- {
- "name": "date",
- "selector": "span.date",
- "type": "text"
+ {
+ "name": "profile_url",
+ "type": "attribute",
+ "attribute": "href",
+ }
+ ],
},
+ {"name": "date", "selector": "span.date", "type": "text"},
{
"name": "read_more",
"selector": "a.read-more",
"type": "nested",
"fields": [
{"name": "text", "type": "text"},
- {"name": "url", "type": "attribute", "attribute": "href"}
- ]
- }
- ]
+ {
+ "name": "url",
+ "type": "attribute",
+ "attribute": "href",
+ },
+ ],
+ },
+ ],
}
- ]
+ ],
}
)
# Demonstrate extraction from raw HTML
run_config = CrawlerRunConfig(
- extraction_strategy=json_strategy,
- cache_mode=CacheMode.BYPASS
+ extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="raw:" + SAMPLE_HTML, # Use raw: prefix for raw HTML
- config=run_config
+ config=run_config,
)
print("Extracted Content:")
print(result.extracted_content)
+
async def demo_input_formats():
"""
Input Format Handling Demo
----------------------
-
+
Demonstrates how LLM extraction can work with different input formats:
1. Markdown (default) - Good for simple text extraction
2. HTML - Better when you need structure and attributes
-
+
This example shows how HTML input can be beneficial when:
- You need to understand the DOM structure
- You want to extract both visible text and HTML attributes
@@ -350,7 +367,7 @@ async def demo_input_formats():
"""
-
+
# Use raw:// prefix to pass HTML content directly
url = f"raw://{dummy_html}"
@@ -359,18 +376,30 @@ async def demo_input_formats():
# Define our schema using Pydantic
class JobRequirement(BaseModel):
- category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
- items: List[str] = Field(description="List of specific requirements in this category")
- priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+ category: str = Field(
+ description="Category of the requirement (e.g., Technical, Soft Skills)"
+ )
+ items: List[str] = Field(
+ description="List of specific requirements in this category"
+ )
+ priority: str = Field(
+ description="Priority level (Required/Preferred) based on the HTML class or context"
+ )
class JobPosting(BaseModel):
title: str = Field(description="Job title")
department: str = Field(description="Department or team")
location: str = Field(description="Job location, including remote options")
salary_range: Optional[str] = Field(description="Salary range if specified")
- requirements: List[JobRequirement] = Field(description="Categorized job requirements")
- application_deadline: Optional[str] = Field(description="Application deadline if specified")
- contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+ requirements: List[JobRequirement] = Field(
+ description="Categorized job requirements"
+ )
+ application_deadline: Optional[str] = Field(
+ description="Application deadline if specified"
+ )
+ contact_info: Optional[dict] = Field(
+ description="Contact information from footer or contact section"
+ )
# First try with markdown (default)
markdown_strategy = LLMExtractionStrategy(
@@ -382,7 +411,7 @@ async def demo_input_formats():
Extract job posting details into structured data. Focus on the visible text content
and organize requirements into categories.
""",
- input_format="markdown" # default
+ input_format="markdown", # default
)
# Then with HTML for better structure understanding
@@ -400,34 +429,25 @@ async def demo_input_formats():
Use HTML attributes and classes to enhance extraction accuracy.
""",
- input_format="html" # explicitly use HTML
+ input_format="html", # explicitly use HTML
)
async with AsyncWebCrawler() as crawler:
# Try with markdown first
- markdown_config = CrawlerRunConfig(
- extraction_strategy=markdown_strategy
- )
- markdown_result = await crawler.arun(
- url=url,
- config=markdown_config
- )
+ markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
+ markdown_result = await crawler.arun(url=url, config=markdown_config)
print("\nMarkdown-based Extraction Result:")
items = json.loads(markdown_result.extracted_content)
print(json.dumps(items, indent=2))
# Then with HTML for better structure understanding
- html_config = CrawlerRunConfig(
- extraction_strategy=html_strategy
- )
- html_result = await crawler.arun(
- url=url,
- config=html_config
- )
+ html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
+ html_result = await crawler.arun(url=url, config=html_config)
print("\nHTML-based Extraction Result:")
items = json.loads(html_result.extracted_content)
print(json.dumps(items, indent=2))
+
# Main execution
async def main():
print("Crawl4AI v0.4.24 Feature Walkthrough")
@@ -439,5 +459,6 @@ async def main():
await demo_json_extraction()
# await demo_input_formats()
+
if __name__ == "__main__":
asyncio.run(main())
diff --git a/docs/md_v3/tutorials/advanced-features.md b/docs/md_v2/advanced/advanced-features.md
similarity index 88%
rename from docs/md_v3/tutorials/advanced-features.md
rename to docs/md_v2/advanced/advanced-features.md
index 16f85874..1f402948 100644
--- a/docs/md_v3/tutorials/advanced-features.md
+++ b/docs/md_v2/advanced/advanced-features.md
@@ -1,15 +1,16 @@
-# Advanced Features (Proxy, PDF, Screenshot, SSL, Headers, & Storage State)
+# Overview of Some Important Advanced Features
+(Proxy, PDF, Screenshot, SSL, Headers, & Storage State)
Crawl4AI offers multiple power-user features that go beyond simple crawling. This tutorial covers:
-1. **Proxy Usage**
-2. **Capturing PDFs & Screenshots**
-3. **Handling SSL Certificates**
-4. **Custom Headers**
-5. **Session Persistence & Local Storage**
+1. **Proxy Usage**
+2. **Capturing PDFs & Screenshots**
+3. **Handling SSL Certificates**
+4. **Custom Headers**
+5. **Session Persistence & Local Storage**
> **Prerequisites**
-> - You have a basic grasp of [AsyncWebCrawler Basics](./async-webcrawler-basics.md)
+> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)
> - You know how to run or configure your Python environment with Playwright installed
---
@@ -84,7 +85,7 @@ async def main():
# Save PDF
if result.pdf:
with open("wikipedia_page.pdf", "wb") as f:
- f.write(b64decode(result.pdf))
+ f.write(result.pdf)
print("[OK] PDF & screenshot captured.")
else:
@@ -186,7 +187,7 @@ if __name__ == "__main__":
**Notes**
- Some sites may react differently to certain headers (e.g., `Accept-Language`).
-- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-anti-bot.md) or use `UserAgentGenerator`.
+- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-based-crawling.md) or use `UserAgentGenerator`.
---
@@ -246,7 +247,7 @@ You can sign in once, export the browser context, and reuse it laterβwithout r
- **`await context.storage_state(path="my_storage.json")`**: Exports cookies, localStorage, etc. to a file.
- Provide `storage_state="my_storage.json"` on subsequent runs to skip the login step.
-**See**: [Detailed session management tutorial](./hooks-custom.md#using-storage_state) or [Explanations β Browser Context & Managed Browser](../../explanations/browser-management.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages).
+**See**: [Detailed session management tutorial](./session-management.md) or [Explanations β Browser Context & Managed Browser](./identity-based-crawling.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages).
---
@@ -283,7 +284,10 @@ async def main():
# 3. Crawl
async with AsyncWebCrawler(config=browser_cfg) as crawler:
- result = await crawler.arun("https://secure.example.com/protected", config=crawler_cfg)
+ result = await crawler.arun(
+ url = "https://secure.example.com/protected",
+ config=crawler_cfg
+ )
if result.success:
print("[OK] Crawled the secure page. Links found:", len(result.links.get("internal", [])))
@@ -318,12 +322,6 @@ Youβve now explored several **advanced** features:
- **Custom Headers** for language or specialized requests
- **Session Persistence** via storage state
-**Where to go next**:
-
-- **[Hooks & Custom Code](./hooks-custom.md)**: For multi-step interactions (clicking βLoad More,β performing logins, etc.)
-- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: If you need more sophisticated user simulation or stealth.
-- **[Reference β BrowserConfig & CrawlerRunConfig](../../reference/configuration.md)**: Detailed param descriptions for everything youβve seen here and more.
-
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runsβstreamlining your entire data collection pipeline.
-**Last Updated**: 2024-XX-XX
\ No newline at end of file
+**Last Updated**: 2025-01-01
\ No newline at end of file
diff --git a/docs/md_v2/advanced/content-processing.md b/docs/md_v2/advanced/content-processing.md
deleted file mode 100644
index 25ed6172..00000000
--- a/docs/md_v2/advanced/content-processing.md
+++ /dev/null
@@ -1,136 +0,0 @@
-# Content Processing
-
-Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction.
-
-## Media Processing
-
-Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance.
-
-### Image Processing
-
-The library handles various image scenarios, including:
-- Regular images
-- Lazy-loaded images
-- Background images
-- Responsive images
-- Image metadata and context
-
-```python
-from crawl4ai.async_configs import CrawlerRunConfig
-
-config = CrawlerRunConfig()
-result = await crawler.arun(url="https://example.com", config=config)
-
-for image in result.media["images"]:
- # Each image includes rich metadata
- print(f"Source: {image['src']}")
- print(f"Alt text: {image['alt']}")
- print(f"Description: {image['desc']}")
- print(f"Context: {image['context']}") # Surrounding text
- print(f"Relevance score: {image['score']}") # 0-10 score
-```
-
-### Handling Lazy-Loaded Content
-
-Crawl4AI already handles lazy loading for media elements. You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`:
-
-```python
-config = CrawlerRunConfig(
- wait_for="css:img[data-src]", # Wait for lazy images
- delay_before_return_html=2.0 # Additional wait time
-)
-result = await crawler.arun(url="https://example.com", config=config)
-```
-
-### Video and Audio Content
-
-The library extracts video and audio elements with their metadata:
-
-```python
-from crawl4ai.async_configs import CrawlerRunConfig
-
-config = CrawlerRunConfig()
-result = await crawler.arun(url="https://example.com", config=config)
-
-# Process videos
-for video in result.media["videos"]:
- print(f"Video source: {video['src']}")
- print(f"Type: {video['type']}")
- print(f"Duration: {video.get('duration')}")
- print(f"Thumbnail: {video.get('poster')}")
-
-# Process audio
-for audio in result.media["audios"]:
- print(f"Audio source: {audio['src']}")
- print(f"Type: {audio['type']}")
- print(f"Duration: {audio.get('duration')}")
-```
-
-## Link Analysis
-
-Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns.
-
-### Link Classification
-
-The library automatically categorizes links into:
-- Internal links (same domain)
-- External links (different domains)
-- Social media links
-- Navigation links
-- Content links
-
-```python
-from crawl4ai.async_configs import CrawlerRunConfig
-
-config = CrawlerRunConfig()
-result = await crawler.arun(url="https://example.com", config=config)
-
-# Analyze internal links
-for link in result.links["internal"]:
- print(f"Internal: {link['href']}")
- print(f"Link text: {link['text']}")
- print(f"Context: {link['context']}") # Surrounding text
- print(f"Type: {link['type']}") # nav, content, etc.
-
-# Analyze external links
-for link in result.links["external"]:
- print(f"External: {link['href']}")
- print(f"Domain: {link['domain']}")
- print(f"Type: {link['type']}")
-```
-
-### Smart Link Filtering
-
-Control which links are included in the results with `CrawlerRunConfig`:
-
-```python
-config = CrawlerRunConfig(
- exclude_external_links=True, # Remove external links
- exclude_social_media_links=True, # Remove social media links
- exclude_social_media_domains=[ # Custom social media domains
- "facebook.com", "twitter.com", "instagram.com"
- ],
- exclude_domains=["ads.example.com"] # Exclude specific domains
-)
-result = await crawler.arun(url="https://example.com", config=config)
-```
-
-## Metadata Extraction
-
-Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content:
-
-```python
-from crawl4ai.async_configs import CrawlerRunConfig
-
-config = CrawlerRunConfig()
-result = await crawler.arun(url="https://example.com", config=config)
-
-metadata = result.metadata
-print(f"Title: {metadata['title']}")
-print(f"Description: {metadata['description']}")
-print(f"Keywords: {metadata['keywords']}")
-print(f"Author: {metadata['author']}")
-print(f"Published Date: {metadata['published_date']}")
-print(f"Modified Date: {metadata['modified_date']}")
-print(f"Language: {metadata['language']}")
-```
diff --git a/docs/md_v2/advanced/crawl-dispatcher.md b/docs/md_v2/advanced/crawl-dispatcher.md
new file mode 100644
index 00000000..e4059f25
--- /dev/null
+++ b/docs/md_v2/advanced/crawl-dispatcher.md
@@ -0,0 +1,12 @@
+# Crawl Dispatcher
+
+We're excited to announce a **Crawl Dispatcher** module that can handle **thousands** of crawling tasks simultaneously. By efficiently managing system resources (memory, CPU, network), this dispatcher ensures high-performance data extraction at scale. It also provides **real-time monitoring** of each crawler's status, memory usage, and overall progress.
+
+Stay tuned—this feature is **coming soon** in an upcoming release of Crawl4AI! For the latest news, keep an eye on our changelogs and follow [@unclecode](https://twitter.com/unclecode) on X.
+
+Below is a **sample** of how the dispatcher's performance monitor might look in action:
+
+
+
+
+We can't wait to bring you this streamlined, **scalable** approach to multi-URL crawling—**watch this space** for updates!
\ No newline at end of file
diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/advanced/file-downloading.md
similarity index 92%
rename from docs/md_v2/basic/file-download.md
rename to docs/md_v2/advanced/file-downloading.md
index eac0f5cb..2fa3759f 100644
--- a/docs/md_v2/basic/file-download.md
+++ b/docs/md_v2/advanced/file-downloading.md
@@ -17,18 +17,6 @@ async def main():
asyncio.run(main())
```
-Or, enable it for a specific crawl by using `CrawlerRunConfig`:
-
-```python
-from crawl4ai.async_configs import CrawlerRunConfig
-
-async def main():
- async with AsyncWebCrawler() as crawler:
- config = CrawlerRunConfig(accept_downloads=True)
- result = await crawler.arun(url="https://example.com", config=config)
- # ...
-```
-
## Specifying Download Location
Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory.
@@ -98,7 +86,8 @@ async def download_multiple_files(url: str, download_path: str):
const downloadLinks = document.querySelectorAll('a[download]');
for (const link of downloadLinks) {
link.click();
- await new Promise(r => setTimeout(r, 2000)); // Delay between clicks
+ // Delay between clicks
+ await new Promise(r => setTimeout(r, 2000));
}
""",
wait_for=10 # Wait for all downloads to start
diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md
index 66042229..6787abd9 100644
--- a/docs/md_v2/advanced/hooks-auth.md
+++ b/docs/md_v2/advanced/hooks-auth.md
@@ -1,121 +1,254 @@
-# Hooks & Auth for AsyncWebCrawler
+# Hooks & Auth in AsyncWebCrawler
-Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`.
+Crawl4AI's **hooks** let you customize the crawler at specific points in the pipeline:
-## Example: Using Crawler Hooks with AsyncWebCrawler
+1. **`on_browser_created`** — After browser creation.
+2. **`on_page_context_created`** — After a new context & page are created.
+3. **`before_goto`** — Just before navigating to a page.
+4. **`after_goto`** — Right after navigation completes.
+5. **`on_user_agent_updated`** — Whenever the user agent changes.
+6. **`on_execution_started`** — Once custom JavaScript execution begins.
+7. **`before_retrieve_html`** — Just before the crawler retrieves final HTML.
+8. **`before_return_html`** — Right before returning the HTML content.
-In this example, we'll:
+**Important**: Avoid heavy tasks in `on_browser_created` since you don't yet have a page context. If you need to *log in*, do so in **`on_page_context_created`**.
-1. Configure the browser and set up authentication when it's created.
-2. Apply custom routing and initial actions when the page context is created.
-3. Add custom headers before navigating to the URL.
-4. Log the current URL after navigation.
-5. Perform actions after JavaScript execution.
-6. Log the length of the HTML before returning it.
+> note "Important Hook Usage Warning"
  **Avoid Misusing Hooks**: Do not manipulate page objects in the wrong hook or at the wrong time, as it can crash the pipeline or produce incorrect results. A common mistake is attempting to handle authentication prematurely—such as creating or closing pages in `on_browser_created`.
-### Hook Definitions
+> **Use the Right Hook for Auth**: If you need to log in or set tokens, use `on_page_context_created`. This ensures you have a valid page/context to work with, without disrupting the main crawling flow.
+
+> **Identity-Based Crawling**: For robust auth, consider identity-based crawling (or passing a session ID) to preserve state. Run your initial login steps in a separate, well-defined process, then feed that session to your main crawl—rather than shoehorning complex authentication into early hooks. Check out [Identity-Based Crawling](../advanced/identity-based-crawling.md) for more details.
+
+> **Be Cautious**: Overwriting or removing elements in the wrong hook can compromise the final crawl. Keep hooks focused on smaller tasks (like route filters, custom headers), and let your main logic (crawling, data extraction) proceed normally.
+
+
+Below is an example demonstration.
+
+---
+
+## Example: Using Hooks in AsyncWebCrawler
```python
import asyncio
-from crawl4ai import AsyncWebCrawler
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
-from playwright.async_api import Page, Browser, BrowserContext
+import json
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
-def log_routing(route):
- # Example: block loading images
- if route.request.resource_type == "image":
- print(f"[HOOK] Blocking image request: {route.request.url}")
- asyncio.create_task(route.abort())
- else:
- asyncio.create_task(route.continue_())
-
-async def on_browser_created(browser: Browser, **kwargs):
- print("[HOOK] on_browser_created")
- # Example: Set browser viewport size and log in
- context = await browser.new_context(viewport={"width": 1920, "height": 1080})
- page = await context.new_page()
- await page.goto("https://example.com/login")
- await page.fill("input[name='username']", "testuser")
- await page.fill("input[name='password']", "password123")
- await page.click("button[type='submit']")
- await page.wait_for_selector("#welcome")
- await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}])
- await page.close()
- await context.close()
-
-async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
- print("[HOOK] on_page_context_created")
- await context.route("**", log_routing)
-
-async def before_goto(page: Page, context: BrowserContext, **kwargs):
- print("[HOOK] before_goto")
- await page.set_extra_http_headers({"X-Test-Header": "test"})
-
-async def after_goto(page: Page, context: BrowserContext, **kwargs):
- print("[HOOK] after_goto")
- print(f"Current URL: {page.url}")
-
-async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
- print("[HOOK] on_execution_started")
- await page.evaluate("console.log('Custom JS executed')")
-
-async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
- print("[HOOK] before_return_html")
- print(f"HTML length: {len(html)}")
- return page
-```
-
-### Using the Hooks with AsyncWebCrawler
-
-```python
async def main():
- print("\nπ Using Crawler Hooks: Customize AsyncWebCrawler with hooks!")
+ print("π Hooks Example: Demonstrating recommended usage")
- # Configure browser and crawler settings
+ # 1) Configure the browser
browser_config = BrowserConfig(
headless=True,
- viewport_width=1920,
- viewport_height=1080
+ verbose=True
)
-
+
+ # 2) Configure the crawler run
crawler_run_config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
- wait_for="footer"
+ wait_for="body",
+ cache_mode=CacheMode.BYPASS
)
- # Initialize crawler
- async with AsyncWebCrawler(config=browser_config) as crawler:
- crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
- crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
- crawler.crawler_strategy.set_hook("before_goto", before_goto)
- crawler.crawler_strategy.set_hook("after_goto", after_goto)
- crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
- crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
+ # 3) Create the crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
- # Run the crawler
- result = await crawler.arun(url="https://example.com", config=crawler_run_config)
+ #
+ # Define Hook Functions
+ #
- print("\nπ¦ Crawler Hooks Result:")
- print(result)
+ async def on_browser_created(browser, **kwargs):
+ # Called once the browser instance is created (but no pages or contexts yet)
+ print("[HOOK] on_browser_created - Browser created successfully!")
+ # Typically, do minimal setup here if needed
+ return browser
-asyncio.run(main())
+ async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
+ # Called right after a new page + context are created (ideal for auth or route config).
+ print("[HOOK] on_page_context_created - Setting up page & context.")
+
+ # Example 1: Route filtering (e.g., block images)
+ async def route_filter(route):
+ if route.request.resource_type == "image":
+ print(f"[HOOK] Blocking image request: {route.request.url}")
+ await route.abort()
+ else:
+ await route.continue_()
+
+ await context.route("**", route_filter)
+
+ # Example 2: (Optional) Simulate a login scenario
+ # (We do NOT create or close pages here, just do quick steps if needed)
+ # e.g., await page.goto("https://example.com/login")
+ # e.g., await page.fill("input[name='username']", "testuser")
+ # e.g., await page.fill("input[name='password']", "password123")
+ # e.g., await page.click("button[type='submit']")
+ # e.g., await page.wait_for_selector("#welcome")
+ # e.g., await context.add_cookies([...])
+ # Then continue
+
+ # Example 3: Adjust the viewport
+ await page.set_viewport_size({"width": 1080, "height": 600})
+ return page
+
+ async def before_goto(
+ page: Page, context: BrowserContext, url: str, **kwargs
+ ):
+ # Called before navigating to each URL.
+ print(f"[HOOK] before_goto - About to navigate: {url}")
+ # e.g., inject custom headers
+ await page.set_extra_http_headers({
+ "Custom-Header": "my-value"
+ })
+ return page
+
+ async def after_goto(
+ page: Page, context: BrowserContext,
+ url: str, response, **kwargs
+ ):
+ # Called after navigation completes.
+ print(f"[HOOK] after_goto - Successfully loaded: {url}")
+ # e.g., wait for a certain element if we want to verify
+ try:
+ await page.wait_for_selector('.content', timeout=1000)
+ print("[HOOK] Found .content element!")
+        except Exception:
+ print("[HOOK] .content not found, continuing anyway.")
+ return page
+
+ async def on_user_agent_updated(
+ page: Page, context: BrowserContext,
+ user_agent: str, **kwargs
+ ):
+ # Called whenever the user agent updates.
+ print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
+ return page
+
+ async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+ # Called after custom JavaScript execution begins.
+ print("[HOOK] on_execution_started - JS code is running!")
+ return page
+
+ async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
+ # Called before final HTML retrieval.
+ print("[HOOK] before_retrieve_html - We can do final actions")
+ # Example: Scroll again
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+ return page
+
+ async def before_return_html(
+ page: Page, context: BrowserContext, html: str, **kwargs
+ ):
+ # Called just before returning the HTML in the result.
+ print(f"[HOOK] before_return_html - HTML length: {len(html)}")
+ return page
+
+ #
+ # Attach Hooks
+ #
+
+ crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+ crawler.crawler_strategy.set_hook(
+ "on_page_context_created", on_page_context_created
+ )
+ crawler.crawler_strategy.set_hook("before_goto", before_goto)
+ crawler.crawler_strategy.set_hook("after_goto", after_goto)
+ crawler.crawler_strategy.set_hook(
+ "on_user_agent_updated", on_user_agent_updated
+ )
+ crawler.crawler_strategy.set_hook(
+ "on_execution_started", on_execution_started
+ )
+ crawler.crawler_strategy.set_hook(
+ "before_retrieve_html", before_retrieve_html
+ )
+ crawler.crawler_strategy.set_hook(
+ "before_return_html", before_return_html
+ )
+
+ await crawler.start()
+
+ # 4) Run the crawler on an example page
+ url = "https://example.com"
+ result = await crawler.arun(url, config=crawler_run_config)
+
+ if result.success:
+ print("\nCrawled URL:", result.url)
+ print("HTML length:", len(result.html))
+ else:
+ print("Error:", result.error_message)
+
+ await crawler.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
```
-### Explanation of Hooks
+---
-- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies).
-- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL.
-- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions.
-- **`after_goto`**: Called after navigation. Use this to verify content or log the URL.
-- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions.
-- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content.
+## Hook Lifecycle Summary
-### Additional Customizations
+1. **`on_browser_created`**:
+   - Browser is up, but **no** pages or contexts yet.
+   - Light setup only—don't try to open or close pages here (that belongs in `on_page_context_created`).
-- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts).
-- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL.
-- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens.
-- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content.
+2. **`on_page_context_created`**:
+   - Perfect for advanced **auth** or route blocking.
+   - You have a **page** + **context** ready but haven't navigated to the target URL yet.
-These hooks provide powerful customization options for tailoring the crawling process to your needs.
+3. **`before_goto`**:
+   - Right before navigation. Typically used for setting **custom headers** or logging the target URL.
+
+4. **`after_goto`**:
+   - After page navigation is done. Good place for verifying content or waiting on essential elements.
+
+5. **`on_user_agent_updated`**:
+   - Whenever the user agent changes (for stealth or different UA modes).
+
+6. **`on_execution_started`**:
+   - If you set `js_code` or run custom scripts, this runs once your JS is about to start.
+
+7. **`before_retrieve_html`**:
+   - Just before the final HTML snapshot is taken. Often you do a final scroll or lazy-load triggers here.
+
+8. **`before_return_html`**:
+   - The last hook before returning HTML to the `CrawlResult`. Good for logging HTML length or minor modifications.
+
+---
+
+## When to Handle Authentication
+
+**Recommended**: Use **`on_page_context_created`** if you need to:
+
+- Navigate to a login page or fill forms
+- Set cookies or localStorage tokens
+- Block resource routes to avoid ads
+
+This ensures the newly created context is under your control **before** `arun()` navigates to the main URL.
+
+---
+
+## Additional Considerations
+
+- **Session Management**: If you want multiple `arun()` calls to reuse a single session, pass `session_id=` in your `CrawlerRunConfig`. Hooks remain the same.
+- **Performance**: Hooks can slow down crawling if they do heavy tasks. Keep them concise.
+- **Error Handling**: If a hook fails, the overall crawl might fail. Catch exceptions or handle them gracefully.
+- **Concurrency**: If you run `arun_many()`, each URL triggers these hooks in parallel. Ensure your hooks are thread/async-safe.
+
+---
+
+## Conclusion
+
+Hooks provide **fine-grained** control over:
+
+- **Browser** creation (light tasks only)
+- **Page** and **context** creation (auth, route blocking)
+- **Navigation** phases
+- **Final HTML** retrieval
+
+Follow the recommended usage:
+- **Login** or advanced tasks in `on_page_context_created`
+- **Custom headers** or logs in `before_goto` / `after_goto`
+- **Scrolling** or final checks in `before_retrieve_html` / `before_return_html`
diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md
new file mode 100644
index 00000000..702d9475
--- /dev/null
+++ b/docs/md_v2/advanced/identity-based-crawling.md
@@ -0,0 +1,180 @@
+# Preserve Your Identity with Crawl4AI
+
+Crawl4AI empowers you to navigate and interact with the web using your **authentic digital identity**, ensuring you're recognized as a human and not mistaken for a bot. This tutorial covers:
+
+1. **Managed Browsers** – The recommended approach for persistent profiles and identity-based crawling.
+2. **Magic Mode** – A simplified fallback solution for quick automation without persistent identity.
+
+---
+
+## 1. Managed Browsers: Your Digital Identity Solution
+
+**Managed Browsers** let developers create and use **persistent browser profiles**. These profiles store local storage, cookies, and other session data, letting you browse as your **real self**—complete with logins, preferences, and cookies.
+
+### Key Benefits
+
+- **Authentic Browsing Experience**: Retain session data and browser fingerprints as though you're a normal user.
+- **Effortless Configuration**: Once you log in or solve CAPTCHAs in your chosen data directory, you can re-run crawls without repeating those steps.
+- **Empowered Data Access**: If you can see the data in your own browser, you can automate its retrieval with your genuine identity.
+
+---
+
+The next section explains how to create a user-data directory using **Playwright's** bundled **Chromium** binary rather than a system-wide Chrome/Edge. You will **locate** that binary and launch it with a `--user-data-dir` argument to set up your profile. You can then point `BrowserConfig.user_data_dir` to that folder for subsequent crawls.
+
+---
+
+## 2. Creating a User Data Directory (Command-Line Approach via Playwright)
+
+If you installed Crawl4AI (which installs Playwright under the hood), you already have a Playwright-managed Chromium on your system. Follow these steps to launch that **Chromium** from your command line, specifying a **custom** data directory:
+
+1. **Find** the Playwright Chromium binary:
+ - On most systems, installed browsers go under a `~/.cache/ms-playwright/` folder or similar path.
+ - To see an overview of installed browsers, run:
+ ```bash
+ python -m playwright install --dry-run
+ ```
+ or
+ ```bash
+ playwright install --dry-run
+ ```
+ (depending on your environment). This shows where Playwright keeps Chromium.
+
+ - For instance, you might see a path like:
+ ```
+ ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome
+ ```
+ on Linux, or a corresponding folder on macOS/Windows.
+
+2. **Launch** the Playwright Chromium binary with a **custom** user-data directory:
+   ```bash
+   # Linux example
+   ~/.cache/ms-playwright/chromium-1234/chrome-linux/chrome \
+       --user-data-dir=/home/<username>/my_chrome_profile
+   ```
+   ```bash
+   # macOS example (Playwright's internal binary)
+   ~/Library/Caches/ms-playwright/chromium-1234/chrome-mac/Chromium.app/Contents/MacOS/Chromium \
+       --user-data-dir=/Users/<username>/my_chrome_profile
+   ```
+   ```powershell
+   # Windows example (PowerShell/cmd)
+   "C:\Users\<username>\AppData\Local\ms-playwright\chromium-1234\chrome-win\chrome.exe" ^
+       --user-data-dir="C:\Users\<username>\my_chrome_profile"
+   ```
+
+ **Replace** the path with the actual subfolder indicated in your `ms-playwright` cache structure.
+ - This **opens** a fresh Chromium with your new or existing data folder.
+ - **Log into** any sites or configure your browser the way you want.
+ - **Close** when doneβyour profile data is saved in that folder.
+
+3. **Use** that folder in **`BrowserConfig.user_data_dir`**:
+ ```python
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+ browser_config = BrowserConfig(
+ headless=True,
+ use_managed_browser=True,
+       user_data_dir="/home/<username>/my_chrome_profile",
+ browser_type="chromium"
+ )
+ ```
+ - Next time you run your code, it reuses that folderβ**preserving** your session data, cookies, local storage, etc.
+
+---
+
+## 3. Using Managed Browsers in Crawl4AI
+
+Once you have a data directory with your session data, pass it to **`BrowserConfig`**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+ # 1) Reference your persistent data directory
+ browser_config = BrowserConfig(
+ headless=True, # 'True' for automated runs
+ verbose=True,
+ use_managed_browser=True, # Enables persistent browser strategy
+ browser_type="chromium",
+ user_data_dir="/path/to/my-chrome-profile"
+ )
+
+ # 2) Standard crawl config
+ crawl_config = CrawlerRunConfig(
+ wait_for="css:.logged-in-content"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com/private", config=crawl_config)
+ if result.success:
+ print("Successfully accessed private data with your identity!")
+ else:
+ print("Error:", result.error_message)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+### Workflow
+
+1. **Login** externally (via CLI or your normal Chrome with `--user-data-dir=...`).
+2. **Close** that browser.
+3. **Use** the same folder in `user_data_dir=` in Crawl4AI.
+4. **Crawl** – The site sees your identity as if you're the same user who just logged in.
+
+---
+
+## 4. Magic Mode: Simplified Automation
+
+If you **don't** need a persistent profile or identity-based approach, **Magic Mode** offers a quick way to simulate human-like browsing without storing long-term data.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=CrawlerRunConfig(
+ magic=True, # Simplifies a lot of interaction
+ remove_overlay_elements=True,
+ page_timeout=60000
+ )
+ )
+```
+
+**Magic Mode**:
+
+- Simulates a user-like experience
+- Randomizes user agent & navigator
+- Randomizes interactions & timings
+- Masks automation signals
+- Attempts pop-up handling
+
+**But** it's no substitute for **true** user-based sessions if you want a fully legitimate identity-based solution.
+
+---
+
+## 5. Comparing Managed Browsers vs. Magic Mode
+
+| Feature | **Managed Browsers** | **Magic Mode** |
+|----------------------------|---------------------------------------------------------------|-----------------------------------------------------|
+| **Session Persistence** | Full localStorage/cookies retained in user_data_dir | No persistent data (fresh each run) |
+| **Genuine Identity** | Real user profile with full rights & preferences | Emulated user-like patterns, but no actual identity |
+| **Complex Sites** | Best for login-gated sites or heavy config | Simple tasks, minimal login or config needed |
+| **Setup** | External creation of user_data_dir, then use in Crawl4AI | Single-line approach (`magic=True`) |
+| **Reliability** | Extremely consistent (same data across runs) | Good for smaller tasks, can be less stable |
+
+---
+
+## 6. Summary
+
+- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`.
+- **Log in** or configure sites as needed, then close the browser.
+- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`.
+- Enjoy **persistent** sessions that reflect your real identity.
+- If you only need quick, ephemeral automation, **Magic Mode** might suffice.
+
+**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.
+
+With these approaches, you preserve your **authentic** browsing environment, ensuring the site sees you exactly as a normal user—no repeated logins or wasted time.
\ No newline at end of file
diff --git a/docs/md_v2/advanced/identity_based_crawling.md b/docs/md_v2/advanced/identity_based_crawling.md
deleted file mode 100644
index c0ab7fd5..00000000
--- a/docs/md_v2/advanced/identity_based_crawling.md
+++ /dev/null
@@ -1,156 +0,0 @@
-### Preserve Your Identity with Crawl4AI
-
-Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios.
-
----
-
-### Managed Browsers: Your Digital Identity Solution
-
-**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web.
-
-#### Why Use Managed Browsers?
-1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior.
-2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access.
-3. **Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions.
-
-#### Steps to Use Managed Browsers
-
-1. **Setup the Browser Configuration**:
- ```python
- from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
- from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-
- browser_config = BrowserConfig(
- headless=False, # Set to False for initial setup to view browser actions
- verbose=True,
- user_agent_mode="random",
- use_managed_browser=True, # Enables persistent browser sessions
- browser_type="chromium",
- user_data_dir="/path/to/user_profile_data" # Path to save session data
- )
- ```
-
-2. **Perform an Initial Run**:
- - Run the crawler with `headless=False`.
- - Manually interact with the site (e.g., solve CAPTCHA or log in).
- - The browser session saves cookies, local storage, and other required data.
-
-3. **Subsequent Runs**:
- - Switch to `headless=True` for automation.
- - The session data is reused, allowing seamless crawling.
-
-#### Example: Extracting Data Using Managed Browsers
-
-```python
-import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-
-async def main():
- # Define schema for structured data extraction
- schema = {
- "name": "Example Data",
- "baseSelector": "div.example",
- "fields": [
- {"name": "title", "selector": "h1", "type": "text"},
- {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
- ]
- }
-
- # Configure crawler
- browser_config = BrowserConfig(
- headless=True, # Automate subsequent runs
- verbose=True,
- use_managed_browser=True,
- user_data_dir="/path/to/user_profile_data"
- )
-
- crawl_config = CrawlerRunConfig(
- extraction_strategy=JsonCssExtractionStrategy(schema),
- wait_for="css:div.example" # Wait for the targeted element to load
- )
-
- async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(
- url="https://example.com",
- config=crawl_config
- )
-
- if result.success:
- print("Extracted Data:", result.extracted_content)
-
-if __name__ == "__main__":
- asyncio.run(main())
-```
-
-### Benefits of Managed Browsers Over Other Methods
-Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely.
-
----
-
-### Magic Mode: Simplified Automation
-
-While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration.
-
-#### What Magic Mode Does:
-- Simulates human browsing by randomizing interaction patterns and timing.
-- Masks browser automation signals.
-- Handles cookie popups and modals.
-- Modifies navigator properties for enhanced compatibility.
-
-#### Using Magic Mode
-
-```python
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url="https://example.com",
- magic=True # Enables all automation features
- )
-```
-
-Magic Mode is particularly useful for:
-- Quick prototyping when a Managed Browser setup is not available.
-- Basic sites requiring minimal interaction or configuration.
-
-#### Example: Combining Magic Mode with Additional Options
-
-```python
-async def crawl_with_magic_mode(url: str):
- async with AsyncWebCrawler(headless=True) as crawler:
- result = await crawler.arun(
- url=url,
- magic=True,
- remove_overlay_elements=True, # Remove popups/modals
- page_timeout=60000 # Increased timeout for complex pages
- )
-
- return result.markdown if result.success else None
-```
-
-### Magic Mode vs. Managed Browsers
-While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor.
-
----
-
-### Key Comparison: Managed Browsers vs. Magic Mode
-
-| Feature | **Managed Browsers** | **Magic Mode** |
-|-------------------------|------------------------------------------|-------------------------------------|
-| **Session Persistence** | Retains cookies and local storage. | No session retention. |
-| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. |
-| **Complex Sites** | Best suited for heavily configured sites.| Works well with simpler challenges.|
-| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. |
-
-#### Recommendation:
-- Use **Managed Browsers** for reliable, session-based crawling and data extraction.
-- Use **Magic Mode** for quick prototyping or when persistent profiles are not required.
-
----
-
-### Conclusion
-
-- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites.
-- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed.
-
-By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs.
-
diff --git a/docs/md_v2/advanced/lazy-loading.md b/docs/md_v2/advanced/lazy-loading.md
new file mode 100644
index 00000000..04688264
--- /dev/null
+++ b/docs/md_v2/advanced/lazy-loading.md
@@ -0,0 +1,104 @@
+## Handling Lazy-Loaded Images
+
+Many websites now load images **lazily** as you scroll. If you need to ensure they appear in your final crawl (and in `result.media`), consider:
+
+1. **`wait_for_images=True`** – Wait for images to fully load.
+2. **`scan_full_page`** – Force the crawler to scroll the entire page, triggering lazy loads.
+3. **`scroll_delay`** – Add small delays between scroll steps.
+
+**Note**: If the site requires multiple "Load More" triggers or complex interactions, see the [Page Interaction docs](../core/page-interaction.md).
+
+### Example: Ensuring Lazy Images Appear
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig
+from crawl4ai.async_configs import CacheMode
+
+async def main():
+ config = CrawlerRunConfig(
+ # Force the crawler to wait until images are fully loaded
+ wait_for_images=True,
+
+ # Option 1: If you want to automatically scroll the page to load images
+ scan_full_page=True, # Tells the crawler to try scrolling the entire page
+ scroll_delay=0.5, # Delay (seconds) between scroll steps
+
+ # Option 2: If the site uses a 'Load More' or JS triggers for images,
+ # you can also specify js_code or wait_for logic here.
+
+ cache_mode=CacheMode.BYPASS,
+ verbose=True
+ )
+
+ async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+ result = await crawler.arun("https://www.example.com/gallery", config=config)
+
+ if result.success:
+ images = result.media.get("images", [])
+ print("Images found:", len(images))
+ for i, img in enumerate(images[:5]):
+ print(f"[Image {i}] URL: {img['src']}, Score: {img.get('score','N/A')}")
+ else:
+ print("Error:", result.error_message)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Explanation**:
+
+- **`wait_for_images=True`**
+ The crawler tries to ensure images have finished loading before finalizing the HTML.
+- **`scan_full_page=True`**
+ Tells the crawler to attempt scrolling from top to bottom. Each scroll step helps trigger lazy loading.
+- **`scroll_delay=0.5`**
+ Pause half a second between each scroll step. Helps the site load images before continuing.
+
+**When to Use**:
+
+- **Lazy-Loading**: If images appear only when the user scrolls into view, `scan_full_page` + `scroll_delay` helps the crawler see them.
+- **Heavier Pages**: If a page is extremely long, be mindful that scanning the entire page can be slow. Adjust `scroll_delay` or the max scroll steps as needed.
+
+---
+
+## Combining with Other Link & Media Filters
+
+You can still combine **lazy-load** logic with the usual **exclude_external_images**, **exclude_domains**, or link filtration:
+
+```python
+config = CrawlerRunConfig(
+ wait_for_images=True,
+ scan_full_page=True,
+ scroll_delay=0.5,
+
+ # Filter out external images if you only want local ones
+ exclude_external_images=True,
+
+ # Exclude certain domains for links
+ exclude_domains=["spammycdn.com"],
+)
+```
+
+This approach ensures you see **all** images from the main domain while ignoring external ones, and the crawler physically scrolls the entire page so that lazy-loading triggers.
+
+---
+
+## Tips & Troubleshooting
+
+1. **Long Pages**
+   - Setting `scan_full_page=True` on extremely long or infinite-scroll pages can be resource-intensive.
+   - Consider using [hooks](../core/page-interaction.md) or specialized logic to load specific sections or "Load More" triggers repeatedly.
+
+2. **Mixed Image Behavior**
+   - Some sites load images in batches as you scroll. If you're missing images, increase your `scroll_delay` or call multiple partial scrolls in a loop with JS code or hooks.
+
+3. **Combining with Dynamic Wait**
+   - If the site has a placeholder that only changes to a real image after a certain event, you might do `wait_for="css:img.loaded"` or a custom JS `wait_for`.
+
+4. **Caching**
+   - If `cache_mode` is enabled, repeated crawls might skip some network fetches. If you suspect caching is missing new images, set `cache_mode=CacheMode.BYPASS` for fresh fetches.
+
+---
+
+With **lazy-loading** support, **wait_for_images**, and **scan_full_page** settings, you can capture the entire gallery or feed of images you expectβeven if the site only loads them as the user scrolls. Combine these with the standard media filtering and domain exclusion for a complete link & media handling strategy.
\ No newline at end of file
diff --git a/docs/md_v2/advanced/magic-mode.md b/docs/md_v2/advanced/magic-mode.md
deleted file mode 100644
index 16c7229e..00000000
--- a/docs/md_v2/advanced/magic-mode.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# Magic Mode & Anti-Bot Protection
-
-Crawl4AI provides powerful anti-detection capabilities, with Magic Mode being the simplest and most comprehensive solution.
-
-## Magic Mode
-
-The easiest way to bypass anti-bot protections:
-
-```python
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(
- url="https://example.com",
- magic=True # Enables all anti-detection features
- )
-```
-
-Magic Mode automatically:
-- Masks browser automation signals
-- Simulates human-like behavior
-- Overrides navigator properties
-- Handles cookie consent popups
-- Manages browser fingerprinting
-- Randomizes timing patterns
-
-## Manual Anti-Bot Options
-
-While Magic Mode is recommended, you can also configure individual anti-detection features:
-
-```python
-result = await crawler.arun(
- url="https://example.com",
- simulate_user=True, # Simulate human behavior
- override_navigator=True # Mask automation signals
-)
-```
-
-Note: When `magic=True` is used, you don't need to set these individual options.
-
-## Example: Handling Protected Sites
-
-```python
-async def crawl_protected_site(url: str):
- async with AsyncWebCrawler(headless=True) as crawler:
- result = await crawler.arun(
- url=url,
- magic=True,
- remove_overlay_elements=True, # Remove popups/modals
- page_timeout=60000 # Increased timeout for protection checks
- )
-
- return result.markdown if result.success else None
-```
diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md
deleted file mode 100644
index bbe07f2f..00000000
--- a/docs/md_v2/advanced/managed_browser.md
+++ /dev/null
@@ -1,188 +0,0 @@
-# Creating Browser Instances, Contexts, and Pages
-
-## 1 Introduction
-
-### Overview of Browser Management in Crawl4AI
-Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling.
-
-### Key Objectives
-- **Anti-Bot Handling**:
- - Implements stealth techniques to evade detection mechanisms used by modern websites.
- - Simulates human-like behavior, such as mouse movements, scrolling, and key presses.
- - Supports integration with third-party services to bypass CAPTCHA challenges.
-- **Persistent Sessions**:
- - Retains session data (cookies, local storage) for workflows requiring user authentication.
- - Allows seamless continuation of tasks across multiple runs without re-authentication.
-- **Scalable Crawling**:
- - Optimized resource utilization for handling thousands of URLs concurrently.
- - Flexible configuration options to tailor crawling behavior to specific requirements.
-
----
-
-## 2 Browser Creation Methods
-
-### Standard Browser Creation
-Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization.
-
-#### Features and Limitations
-- **Features**:
- - Quick and straightforward setup for small-scale tasks.
- - Supports headless and headful modes.
-- **Limitations**:
- - Lacks advanced customization options like session reuse.
- - May struggle with sites employing strict anti-bot measures.
-
-#### Example Usage
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig
-
-browser_config = BrowserConfig(browser_type="chromium", headless=True)
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun("https://crawl4ai.com")
- print(result.markdown)
-```
-
-### Persistent Contexts
-Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information.
-
-#### Benefits of Using `user_data_dir`
-- **Session Persistence**:
- - Stores cookies, local storage, and cache between crawling sessions.
- - Reduces overhead for repetitive logins or multi-step workflows.
-- **Enhanced Performance**:
- - Leverages pre-loaded resources for faster page loading.
-- **Flexibility**:
- - Adapts to complex workflows requiring user-specific configurations.
-
-#### Example: Setting Up Persistent Contexts
-```python
-config = BrowserConfig(user_data_dir="/path/to/user/data")
-async with AsyncWebCrawler(config=config) as crawler:
- result = await crawler.arun("https://crawl4ai.com")
- print(result.markdown)
-```
-
-### Managed Browser
-The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
-
-#### How It Works
-- **Browser Process Management**:
- - Automates initialization and cleanup of browser processes.
- - Optimizes resource usage by pooling and reusing browser instances.
-- **Debugging Support**:
- - Integrates with debugging tools like Chrome Developer Tools for real-time inspection.
-- **Anti-Bot Measures**:
- - Implements stealth plugins to mimic real user behavior and bypass bot detection.
-
-#### Features
-- **Customizable Configurations**:
- - Supports advanced options such as viewport resizing, proxy settings, and header manipulation.
-- **Debugging and Logging**:
- - Logs detailed browser interactions for debugging and performance analysis.
-- **Scalability**:
- - Handles multiple browser instances concurrently, scaling dynamically based on workload.
-
-#### Example: Using `ManagedBrowser`
-```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig
-
-config = BrowserConfig(headless=False, debug_port=9222)
-async with AsyncWebCrawler(config=config) as crawler:
- result = await crawler.arun("https://crawl4ai.com")
- print(result.markdown)
-```
-
----
-
-## 3 Context and Page Management
-
-### Creating and Configuring Browser Contexts
-Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage.
-
-#### Customizations
-- **Headers and Cookies**:
- - Define custom headers to mimic specific devices or browsers.
- - Set cookies for authenticated sessions.
-- **Session Reuse**:
- - Retain and reuse session data across multiple requests.
- - Example: Preserve login states for authenticated crawls.
-
-#### Example: Context Initialization
-```python
-from crawl4ai import CrawlerRunConfig
-
-config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"})
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun("https://crawl4ai.com", config=config)
- print(result.markdown)
-```
-
-### Creating Pages
-Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions.
-
-#### Key Features
-- **IFrame Handling**:
- - Extract content from embedded iframes.
- - Navigate and interact with nested content.
-- **Viewport Customization**:
- - Adjust viewport size to match target device dimensions.
-- **Lazy Loading**:
- - Ensure dynamic elements are fully loaded before extraction.
-
-#### Example: Page Initialization
-```python
-config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080)
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun("https://crawl4ai.com", config=config)
- print(result.markdown)
-```
-
----
-
-## 4 Advanced Features and Best Practices
-
-### Debugging and Logging
-Remote debugging provides a powerful way to troubleshoot complex crawling workflows.
-
-#### Example: Enabling Remote Debugging
-```python
-config = BrowserConfig(debug_port=9222)
-async with AsyncWebCrawler(config=config) as crawler:
- result = await crawler.arun("https://crawl4ai.com")
-```
-
-### Anti-Bot Techniques
-- **Human Behavior Simulation**:
- - Mimic real user actions, such as scrolling, clicking, and typing.
- - Example: Use JavaScript to simulate interactions.
-- **Captcha Handling**:
- - Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving.
-
-#### Example: Simulating User Actions
-```python
-js_code = """
-(async () => {
- document.querySelector('input[name="search"]').value = 'test';
- document.querySelector('button[type="submit"]').click();
-})();
-"""
-config = CrawlerRunConfig(js_code=[js_code])
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun("https://crawl4ai.com", config=config)
-```
-
-### Optimizations for Performance and Scalability
-- **Persistent Contexts**:
- - Reuse browser contexts to minimize resource consumption.
-- **Concurrent Crawls**:
- - Use `arun_many` with a controlled semaphore count for efficient batch processing.
-
-#### Example: Scaling Crawls
-```python
-urls = ["https://example1.com", "https://example2.com"]
-config = CrawlerRunConfig(semaphore_count=10)
-async with AsyncWebCrawler() as crawler:
- results = await crawler.arun_many(urls, config=config)
- for result in results:
- print(result.url, result.markdown)
-```
diff --git a/docs/md_v2/advanced/multi-url-crawling copy.md b/docs/md_v2/advanced/multi-url-crawling copy.md
new file mode 100644
index 00000000..a1d2b423
--- /dev/null
+++ b/docs/md_v2/advanced/multi-url-crawling copy.md
@@ -0,0 +1,264 @@
+# Optimized Multi-URL Crawling
+
+> **Note**: We're developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**'s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated.
+
+
+Crawl4AI's **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to:
+
+1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
+2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
+
+When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
+
+---
+
+## 1. Why Avoid Simple Loops per URL?
+
+If you naively do:
+
+```python
+for url in urls:
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url)
+```
+
+You end up:
+
+1. Spinning up a **new** browser for each URL
+2. Closing it immediately after the single crawl
+3. Potentially using a lot of CPU/memory for short-living browsers
+4. Missing out on session reusability if you have login or ongoing states
+
+**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
+
+---
+
+## 2. Sequential Crawling with Session Reuse
+
+### 2.1 Overview
+
+1. **One** `AsyncWebCrawler` instance for **all** URLs.
+2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
+3. The crawler is only closed at the **end**.
+
+**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
+
+### 2.2 Example Code
+
+```python
+import asyncio
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def crawl_sequential(urls: List[str]):
+ print("\n=== Sequential Crawling with Session Reuse ===")
+
+ browser_config = BrowserConfig(
+ headless=True,
+ # For better performance in Docker or low-memory environments:
+ extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+ )
+
+ crawl_config = CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator()
+ )
+
+ # Create the crawler (opens the browser)
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ try:
+ session_id = "session1" # Reuse the same session across all URLs
+ for url in urls:
+ result = await crawler.arun(
+ url=url,
+ config=crawl_config,
+ session_id=session_id
+ )
+ if result.success:
+ print(f"Successfully crawled: {url}")
+ # E.g. check markdown length
+ print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+ else:
+ print(f"Failed: {url} - Error: {result.error_message}")
+ finally:
+ # After all URLs are done, close the crawler (and the browser)
+ await crawler.close()
+
+async def main():
+ urls = [
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3"
+ ]
+ await crawl_sequential(urls)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Why It's Good**:
+
+- **One** browser launch.
+- Minimal memory usage.
+- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
+
+---
+
+## 3. Parallel Crawling with Browser Reuse
+
+### 3.1 Overview
+
+To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
+
+### 3.2 Example Code
+
+For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package.
+
+```bash
+pip install psutil
+```
+
+Then you can run the following code:
+
+```python
+import os
+import sys
+import psutil
+import asyncio
+
+__location__ = os.path.dirname(os.path.abspath(__file__))
+__output__ = os.path.join(__location__, "output")
+
+# Append parent directory to system path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+ print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
+
+ # We'll keep track of peak memory usage across all tasks
+ peak_memory = 0
+ process = psutil.Process(os.getpid())
+
+ def log_memory(prefix: str = ""):
+ nonlocal peak_memory
+ current_mem = process.memory_info().rss # in bytes
+ if current_mem > peak_memory:
+ peak_memory = current_mem
+ print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
+
+ # Minimal browser config
+ browser_config = BrowserConfig(
+ headless=True,
+ verbose=False, # corrected from 'verbos=False'
+ extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+ )
+ crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ # Create the crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ try:
+ # We'll chunk the URLs in batches of 'max_concurrent'
+ success_count = 0
+ fail_count = 0
+ for i in range(0, len(urls), max_concurrent):
+ batch = urls[i : i + max_concurrent]
+ tasks = []
+
+ for j, url in enumerate(batch):
+ # Unique session_id per concurrent sub-task
+ session_id = f"parallel_session_{i + j}"
+ task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+ tasks.append(task)
+
+ # Check memory usage prior to launching tasks
+ log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
+
+ # Gather results
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+
+ # Check memory usage after tasks complete
+ log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
+
+ # Evaluate results
+ for url, result in zip(batch, results):
+ if isinstance(result, Exception):
+ print(f"Error crawling {url}: {result}")
+ fail_count += 1
+ elif result.success:
+ success_count += 1
+ else:
+ fail_count += 1
+
+ print(f"\nSummary:")
+ print(f" - Successfully crawled: {success_count}")
+ print(f" - Failed: {fail_count}")
+
+ finally:
+ print("\nClosing crawler...")
+ await crawler.close()
+ # Final memory log
+ log_memory(prefix="Final: ")
+ print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
+
+async def main():
+ urls = [
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ "https://example.com/page4"
+ ]
+ await crawl_parallel(urls, max_concurrent=2)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+
+```
+
+**Notes**:
+
+- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
+- Each parallel sub-task might get its own `session_id` so they don't share cookies/localStorage (unless that's desired).
+- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
+
+---
+
+## 4. Performance Tips
+
+1. **Extra Browser Args**
+ - `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
+ - `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
+
+2. **Session Reuse**
+ - If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`.
+ - If you want isolation (each URL fresh), create unique sessions.
+
+3. **Batching**
+ - If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`).
+ - Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible.
+
+4. **Cache**
+ - If your pages share many resources or you're re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`.
+ - If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`.
+
+5. **Hooks**
+ - You can set up global hooks for each crawler (like to block images) or per-run if you want.
+ - Keep them consistent if you're reusing sessions.
+
+---
+
+## 5. Summary
+
+- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL.
+- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs.
+- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead.
+- Close the crawler once at the end, ensuring the browser is only opened/closed once.
+
+For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling!
\ No newline at end of file
diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md
new file mode 100644
index 00000000..cae789a2
--- /dev/null
+++ b/docs/md_v2/advanced/multi-url-crawling.md
@@ -0,0 +1,236 @@
+# Advanced Multi-URL Crawling with Dispatchers
+
+> **Heads Up**: Crawl4AI supports advanced dispatchers for **parallel** or **throttled** crawling, providing dynamic rate limiting and memory usage checks. The built-in `arun_many()` function uses these dispatchers to handle concurrency efficiently.
+
+## 1. Introduction
+
+When crawling many URLs:
+- **Basic**: Use `arun()` in a loop (simple but less efficient)
+- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
+- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
+
+**Why Dispatchers?**
+- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
+- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
+- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
+- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
+
+## 2. Core Components
+
+### 2.1 Rate Limiter
+
+```python
+class RateLimiter:
+ def __init__(
+ base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests
+ max_delay: float = 60.0, # Maximum backoff delay
+ max_retries: int = 3, # Retries before giving up
+ rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff
+ )
+```
+
+The RateLimiter provides:
+- Random delays between requests
+- Exponential backoff on rate limit responses
+- Domain-specific rate limiting
+- Automatic retry handling
+
+### 2.2 Crawler Monitor
+
+The CrawlerMonitor provides real-time visibility into crawling operations:
+
+```python
+monitor = CrawlerMonitor(
+ max_visible_rows=15, # Maximum rows in live display
+ display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view
+)
+```
+
+**Display Modes**:
+1. **DETAILED**: Shows individual task status, memory usage, and timing
+2. **AGGREGATED**: Displays summary statistics and overall progress
+
+## 3. Available Dispatchers
+
+### 3.1 MemoryAdaptiveDispatcher (Default)
+
+Automatically manages concurrency based on system memory usage:
+
+```python
+dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=70.0, # Pause if memory exceeds this
+ check_interval=1.0, # How often to check memory
+ max_session_permit=10, # Maximum concurrent tasks
+ rate_limiter=RateLimiter( # Optional rate limiting
+ base_delay=(1.0, 2.0),
+ max_delay=30.0,
+ max_retries=2
+ ),
+ monitor=CrawlerMonitor( # Optional monitoring
+ max_visible_rows=15,
+ display_mode=DisplayMode.DETAILED
+ )
+)
+```
+
+### 3.2 SemaphoreDispatcher
+
+Provides simple concurrency control with a fixed limit:
+
+```python
+dispatcher = SemaphoreDispatcher(
+ semaphore_count=5, # Fixed concurrent tasks
+ rate_limiter=RateLimiter( # Optional rate limiting
+ base_delay=(0.5, 1.0),
+ max_delay=10.0
+ ),
+ monitor=CrawlerMonitor( # Optional monitoring
+ max_visible_rows=15,
+ display_mode=DisplayMode.DETAILED
+ )
+)
+```
+
+## 4. Usage Examples
+
+### 4.1 Batch Processing (Default)
+
+```python
+async def crawl_batch():
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ stream=False # Default: get all results at once
+ )
+
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=70.0,
+ check_interval=1.0,
+ max_session_permit=10,
+ monitor=CrawlerMonitor(
+ display_mode=DisplayMode.DETAILED
+ )
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Get all results at once
+ results = await crawler.arun_many(
+ urls=urls,
+ config=run_config,
+ dispatcher=dispatcher
+ )
+
+ # Process all results after completion
+ for result in results:
+ if result.success:
+ await process_result(result)
+ else:
+ print(f"Failed to crawl {result.url}: {result.error_message}")
+```
+
+### 4.2 Streaming Mode
+
+```python
+async def crawl_streaming():
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ stream=True # Enable streaming mode
+ )
+
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=70.0,
+ check_interval=1.0,
+ max_session_permit=10,
+ monitor=CrawlerMonitor(
+ display_mode=DisplayMode.DETAILED
+ )
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Process results as they become available
+ async for result in await crawler.arun_many(
+ urls=urls,
+ config=run_config,
+ dispatcher=dispatcher
+ ):
+ if result.success:
+ # Process each result immediately
+ await process_result(result)
+ else:
+ print(f"Failed to crawl {result.url}: {result.error_message}")
+```
+
+### 4.3 Semaphore-based Crawling
+
+```python
+async def crawl_with_semaphore(urls):
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ dispatcher = SemaphoreDispatcher(
+ semaphore_count=5,
+ rate_limiter=RateLimiter(
+ base_delay=(0.5, 1.0),
+ max_delay=10.0
+ ),
+ monitor=CrawlerMonitor(
+ max_visible_rows=15,
+ display_mode=DisplayMode.DETAILED
+ )
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ results = await crawler.arun_many(
+ urls,
+ config=run_config,
+ dispatcher=dispatcher
+ )
+ return results
+```
+
+## 5. Dispatch Results
+
+Each crawl result includes dispatch information:
+
+```python
+@dataclass
+class DispatchResult:
+ task_id: str
+ memory_usage: float
+ peak_memory: float
+ start_time: datetime
+ end_time: datetime
+ error_message: str = ""
+```
+
+Access via `result.dispatch_result`:
+
+```python
+for result in results:
+ if result.success:
+ dr = result.dispatch_result
+ print(f"URL: {result.url}")
+ print(f"Memory: {dr.memory_usage:.1f}MB")
+ print(f"Duration: {dr.end_time - dr.start_time}")
+```
+
+## 6. Summary
+
+1. **Two Dispatcher Types**:
+ - MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
+ - SemaphoreDispatcher: Fixed concurrency limit
+
+2. **Optional Components**:
+ - RateLimiter: Smart request pacing and backoff
+ - CrawlerMonitor: Real-time progress visualization
+
+3. **Key Benefits**:
+ - Automatic memory management
+ - Built-in rate limiting
+ - Live progress monitoring
+ - Flexible concurrency control
+
+Choose the dispatcher that best fits your needs:
+- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
+- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios
diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md
index 8989777b..b98c17e5 100644
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -1,6 +1,4 @@
-# Proxy & Security
-
-Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction.
+# Proxy
## Basic Proxy Setup
@@ -58,38 +56,3 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=browser_config)
```
-## Custom Headers
-
-Add security-related headers via `BrowserConfig`:
-
-```python
-from crawl4ai.async_configs import BrowserConfig
-
-headers = {
- "X-Forwarded-For": "203.0.113.195",
- "Accept-Language": "en-US,en;q=0.9",
- "Cache-Control": "no-cache",
- "Pragma": "no-cache"
-}
-
-browser_config = BrowserConfig(headers=headers)
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(url="https://example.com")
-```
-
-## Combining with Magic Mode
-
-For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`:
-
-```python
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
-
-browser_config = BrowserConfig(
- proxy="http://proxy.example.com:8080",
- headers={"Accept-Language": "en-US"}
-)
-crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features
-
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(url="https://example.com", config=crawler_config)
-```
diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md
deleted file mode 100644
index ba1ae0a0..00000000
--- a/docs/md_v2/advanced/session-management-advanced.md
+++ /dev/null
@@ -1,179 +0,0 @@
-### Session-Based Crawling for Dynamic Content
-
-In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively.
-
-This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features.
-
----
-
-## Understanding Session-Based Crawling
-
-Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling:
-
-1. **Efficient handling of dynamic content** without reloading the page.
-2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling).
-3. **State maintenance** for authenticated sessions or multi-step workflows.
-4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources.
-
-**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks.
-
----
-
-## Basic Concepts
-
-Before diving into examples, here are some key concepts:
-
-- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state.
-- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior.
-- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons.
-- **CSS Selectors**: Target specific elements for interaction or data extraction.
-- **Extraction Strategy**: Define rules to extract structured data.
-- **Wait Conditions**: Specify conditions to wait for before proceeding.
-
----
-
-## Example 1: Basic Session-Based Crawling
-
-A simple example using session-based crawling:
-
-```python
-import asyncio
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
-from crawl4ai.cache_context import CacheMode
-
-async def basic_session_crawl():
- async with AsyncWebCrawler() as crawler:
- session_id = "dynamic_content_session"
- url = "https://example.com/dynamic-content"
-
- for page in range(3):
- config = CrawlerRunConfig(
- url=url,
- session_id=session_id,
- js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
- css_selector=".content-item",
- cache_mode=CacheMode.BYPASS
- )
-
- result = await crawler.arun(config=config)
- print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
-
- await crawler.crawler_strategy.kill_session(session_id)
-
-asyncio.run(basic_session_crawl())
-```
-
-This example shows:
-1. Reusing the same `session_id` across multiple requests.
-2. Executing JavaScript to load more content dynamically.
-3. Properly closing the session to free resources.
-
----
-
-## Advanced Technique 1: Custom Execution Hooks
-
-Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
-
-```python
-async def advanced_session_crawl_with_hooks():
- first_commit = ""
-
- async def on_execution_started(page):
- nonlocal first_commit
- try:
- while True:
- await page.wait_for_selector("li.commit-item h4")
- commit = await page.query_selector("li.commit-item h4")
- commit = await commit.evaluate("(element) => element.textContent").strip()
- if commit and commit != first_commit:
- first_commit = commit
- break
- await asyncio.sleep(0.5)
- except Exception as e:
- print(f"Warning: New content didn't appear: {e}")
-
- async with AsyncWebCrawler() as crawler:
- session_id = "commit_session"
- url = "https://github.com/example/repo/commits/main"
- crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
-
- js_next_page = """document.querySelector('a.pagination-next').click();"""
-
- for page in range(3):
- config = CrawlerRunConfig(
- url=url,
- session_id=session_id,
- js_code=js_next_page if page > 0 else None,
- css_selector="li.commit-item",
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS
- )
-
- result = await crawler.arun(config=config)
- print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
-
-asyncio.run(advanced_session_crawl_with_hooks())
-```
-
-This technique ensures new content loads before the next action.
-
----
-
-## Advanced Technique 2: Integrated JavaScript Execution and Waiting
-
-Combine JavaScript execution and waiting logic for concise handling of dynamic content:
-
-```python
-async def integrated_js_and_wait_crawl():
- async with AsyncWebCrawler() as crawler:
- session_id = "integrated_session"
- url = "https://github.com/example/repo/commits/main"
-
- js_next_page_and_wait = """
- (async () => {
- const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
- const initialCommit = getCurrentCommit();
- document.querySelector('a.pagination-next').click();
- while (getCurrentCommit() === initialCommit) {
- await new Promise(resolve => setTimeout(resolve, 100));
- }
- })();
- """
-
- for page in range(3):
- config = CrawlerRunConfig(
- url=url,
- session_id=session_id,
- js_code=js_next_page_and_wait if page > 0 else None,
- css_selector="li.commit-item",
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS
- )
-
- result = await crawler.arun(config=config)
- print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
-
-asyncio.run(integrated_js_and_wait_crawl())
-```
-
----
-
-## Best Practices for Session-Based Crawling
-
-1. **Unique Session IDs**: Assign descriptive and unique `session_id` values.
-2. **Close Sessions**: Always clean up sessions with `kill_session` after use.
-3. **Error Handling**: Anticipate and handle errors gracefully.
-4. **Respect Websites**: Follow terms of service and robots.txt.
-5. **Delays**: Add delays to avoid overwhelming servers.
-6. **Optimize JavaScript**: Keep scripts concise for better performance.
-7. **Monitor Resources**: Track memory and CPU usage for long sessions.
-
----
-
-## Conclusion
-
-Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies.
\ No newline at end of file
diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md
index e9348223..180dfc85 100644
--- a/docs/md_v2/advanced/session-management.md
+++ b/docs/md_v2/advanced/session-management.md
@@ -1,4 +1,4 @@
-### Session Management
+# Session Management
Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for:
@@ -20,8 +20,12 @@ async with AsyncWebCrawler() as crawler:
session_id = "my_session"
# Define configurations
- config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id)
- config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id)
+ config1 = CrawlerRunConfig(
+ url="https://example.com/page1", session_id=session_id
+ )
+ config2 = CrawlerRunConfig(
+ url="https://example.com/page2", session_id=session_id
+ )
# First request
result1 = await crawler.arun(config=config1)
@@ -54,7 +58,9 @@ async def crawl_dynamic_content():
schema = {
"name": "Commit Extractor",
"baseSelector": "li.Box-sc-g0xbh4-0",
- "fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}],
+ "fields": [{
+ "name": "title", "selector": "h4.markdown-title", "type": "text"
+ }],
}
extraction_strategy = JsonCssExtractionStrategy(schema)
@@ -87,51 +93,146 @@ async def crawl_dynamic_content():
---
-#### Session Best Practices
+## Example 1: Basic Session-Based Crawling
-1. **Descriptive Session IDs**:
- Use meaningful names for session IDs to organize workflows:
- ```python
- session_id = "login_flow_session"
- session_id = "product_catalog_session"
- ```
+A simple example using session-based crawling:
-2. **Resource Management**:
- Always ensure sessions are cleaned up to free resources:
- ```python
- try:
- # Your crawling code here
- pass
- finally:
- await crawler.crawler_strategy.kill_session(session_id)
- ```
+```python
+import asyncio
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.cache_context import CacheMode
-3. **State Maintenance**:
- Reuse the session for subsequent actions within the same workflow:
- ```python
- # Step 1: Login
- login_config = CrawlerRunConfig(
- url="https://example.com/login",
- session_id=session_id,
- js_code="document.querySelector('form').submit();"
- )
- await crawler.arun(config=login_config)
+async def basic_session_crawl():
+ async with AsyncWebCrawler() as crawler:
+ session_id = "dynamic_content_session"
+ url = "https://example.com/dynamic-content"
- # Step 2: Verify login success
- dashboard_config = CrawlerRunConfig(
- url="https://example.com/dashboard",
- session_id=session_id,
- wait_for="css:.user-profile" # Wait for authenticated content
- )
- result = await crawler.arun(config=dashboard_config)
- ```
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code="document.querySelector('.load-more-button').click();" if page > 0 else None,
+ css_selector=".content-item",
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+ print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(basic_session_crawl())
+```
+
+This example shows:
+1. Reusing the same `session_id` across multiple requests.
+2. Executing JavaScript to load more content dynamically.
+3. Properly closing the session to free resources.
+
+---
+
+## Advanced Technique 1: Custom Execution Hooks
+
+> Warning: You might feel confused by the end of the next few examples 😅, so make sure you are comfortable with the order of the parts before you start this.
+
+Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically:
+
+```python
+async def advanced_session_crawl_with_hooks():
+ first_commit = ""
+
+ async def on_execution_started(page):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.commit-item h4")
+ commit = await page.query_selector("li.commit-item h4")
+ commit = await commit.evaluate("(element) => element.textContent").strip()
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear: {e}")
+
+ async with AsyncWebCrawler() as crawler:
+ session_id = "commit_session"
+ url = "https://github.com/example/repo/commits/main"
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ js_next_page = """document.querySelector('a.pagination-next').click();"""
+
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code=js_next_page if page > 0 else None,
+ css_selector="li.commit-item",
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+ print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(advanced_session_crawl_with_hooks())
+```
+
+This technique ensures new content loads before the next action.
+
+---
+
+## Advanced Technique 2: Integrated JavaScript Execution and Waiting
+
+Combine JavaScript execution and waiting logic for concise handling of dynamic content:
+
+```python
+async def integrated_js_and_wait_crawl():
+ async with AsyncWebCrawler() as crawler:
+ session_id = "integrated_session"
+ url = "https://github.com/example/repo/commits/main"
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim();
+ const initialCommit = getCurrentCommit();
+ document.querySelector('a.pagination-next').click();
+ while (getCurrentCommit() === initialCommit) {
+ await new Promise(resolve => setTimeout(resolve, 100));
+ }
+ })();
+ """
+
+ for page in range(3):
+ config = CrawlerRunConfig(
+ url=url,
+ session_id=session_id,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ css_selector="li.commit-item",
+ js_only=page > 0,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ result = await crawler.arun(config=config)
+ print(f"Page {page + 1}: Found {len(result.extracted_content)} commits")
+
+ await crawler.crawler_strategy.kill_session(session_id)
+
+asyncio.run(integrated_js_and_wait_crawl())
+```
---
#### Common Use Cases for Sessions
-1. **Authentication Flows**: Login and interact with secured pages.
-2. **Pagination Handling**: Navigate through multiple pages.
-3. **Form Submissions**: Fill forms, submit, and process results.
-4. **Multi-step Processes**: Complete workflows that span multiple actions.
-5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
+1. **Authentication Flows**: Login and interact with secured pages.
+
+2. **Pagination Handling**: Navigate through multiple pages.
+
+3. **Form Submissions**: Fill forms, submit, and process results.
+
+4. **Multi-step Processes**: Complete workflows that span multiple actions.
+
+5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content.
diff --git a/docs/md_v2/advanced/ssl-certificate.md b/docs/md_v2/advanced/ssl-certificate.md
new file mode 100644
index 00000000..fa04716a
--- /dev/null
+++ b/docs/md_v2/advanced/ssl-certificate.md
@@ -0,0 +1,179 @@
+# `SSLCertificate` Reference
+
+The **`SSLCertificate`** class encapsulates an SSL certificate's data and allows exporting it in various formats (PEM, DER, JSON, or text). It's used within **Crawl4AI** whenever you set **`fetch_ssl_certificate=True`** in your **`CrawlerRunConfig`**.
+
+## 1. Overview
+
+**Location**: `crawl4ai/ssl_certificate.py`
+
+```python
+class SSLCertificate:
+ """
+ Represents an SSL certificate with methods to export in various formats.
+
+ Main Methods:
+ - from_url(url, timeout=10)
+ - from_file(file_path)
+ - from_binary(binary_data)
+ - to_json(filepath=None)
+ - to_pem(filepath=None)
+ - to_der(filepath=None)
+ ...
+
+ Common Properties:
+ - issuer
+ - subject
+ - valid_from
+ - valid_until
+ - fingerprint
+ """
+```
+
+### Typical Use Case
+1. You **enable** certificate fetching in your crawl by:
+ ```python
+ CrawlerRunConfig(fetch_ssl_certificate=True, ...)
+ ```
+2. After `arun()`, if `result.ssl_certificate` is present, it's an instance of **`SSLCertificate`**.
+3. You can **read** basic properties (issuer, subject, validity) or **export** them in multiple formats.
+
+---
+
+## 2. Construction & Fetching
+
+### 2.1 **`from_url(url, timeout=10)`**
+Manually load an SSL certificate from a given URL (port 443). Typically used internally, but you can call it directly if you want:
+
+```python
+cert = SSLCertificate.from_url("https://example.com")
+if cert:
+ print("Fingerprint:", cert.fingerprint)
+```
+
+### 2.2 **`from_file(file_path)`**
+Load from a file containing certificate data in ASN.1 or DER. Rarely needed unless you have local cert files:
+
+```python
+cert = SSLCertificate.from_file("/path/to/cert.der")
+```
+
+### 2.3 **`from_binary(binary_data)`**
+Initialize from raw binary. E.g., if you captured it from a socket or another source:
+
+```python
+cert = SSLCertificate.from_binary(raw_bytes)
+```
+
+---
+
+## 3. Common Properties
+
+After obtaining an **`SSLCertificate`** instance (e.g. `result.ssl_certificate` from a crawl), you can read:
+
+1. **`issuer`** *(dict)*
+ - E.g. `{"CN": "My Root CA", "O": "..."}`
+2. **`subject`** *(dict)*
+ - E.g. `{"CN": "example.com", "O": "ExampleOrg"}`
+3. **`valid_from`** *(str)*
+ - NotBefore date/time. Often in ASN.1/UTC format.
+4. **`valid_until`** *(str)*
+ - NotAfter date/time.
+5. **`fingerprint`** *(str)*
+ - The SHA-256 digest (lowercase hex).
+ - E.g. `"d14d2e..."`
+
+---
+
+## 4. Export Methods
+
+Once you have an **`SSLCertificate`** object, you can **export** or **inspect** it:
+
+### 4.1 **`to_json(filepath=None)` → `Optional[str]`**
+- Returns a JSON string containing the parsed certificate fields.
+- If `filepath` is provided, saves it to disk instead, returning `None`.
+
+**Usage**:
+```python
+json_data = cert.to_json() # returns JSON string
+cert.to_json("certificate.json") # writes file, returns None
+```
+
+### 4.2 **`to_pem(filepath=None)` → `Optional[str]`**
+- Returns a PEM-encoded string (common for web servers).
+- If `filepath` is provided, saves it to disk instead.
+
+```python
+pem_str = cert.to_pem() # in-memory PEM string
+cert.to_pem("/path/to/cert.pem") # saved to file
+```
+
+### 4.3 **`to_der(filepath=None)` → `Optional[bytes]`**
+- Returns the original DER (binary ASN.1) bytes.
+- If `filepath` is specified, writes the bytes there instead.
+
+```python
+der_bytes = cert.to_der()
+cert.to_der("certificate.der")
+```
+
+### 4.4 (Optional) **`export_as_text()`**
+- If you see a method like `export_as_text()`, it typically returns an OpenSSL-style textual representation.
+- Not always needed, but can help for debugging or manual inspection.
+
+---
+
+## 5. Example Usage in Crawl4AI
+
+Below is a minimal sample showing how the crawler obtains an SSL cert from a site, then reads or exports it. The code snippet:
+
+```python
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+ tmp_dir = "tmp"
+ os.makedirs(tmp_dir, exist_ok=True)
+
+ config = CrawlerRunConfig(
+ fetch_ssl_certificate=True,
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com", config=config)
+ if result.success and result.ssl_certificate:
+ cert = result.ssl_certificate
+ # 1. Basic Info
+ print("Issuer CN:", cert.issuer.get("CN", ""))
+ print("Valid until:", cert.valid_until)
+ print("Fingerprint:", cert.fingerprint)
+
+ # 2. Export
+ cert.to_json(os.path.join(tmp_dir, "certificate.json"))
+ cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))
+ cert.to_der(os.path.join(tmp_dir, "certificate.der"))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+## 6. Notes & Best Practices
+
+1. **Timeout**: `SSLCertificate.from_url` internally uses a default **10s** socket connect and wraps SSL.
+2. **Binary Form**: The certificate is loaded in ASN.1 (DER) form, then re-parsed by `OpenSSL.crypto`.
+3. **Validation**: This does **not** validate the certificate chain or trust store. It only fetches and parses.
+4. **Integration**: Within Crawl4AI, you typically just set `fetch_ssl_certificate=True` in `CrawlerRunConfig`; the final result's `ssl_certificate` is automatically built.
+5. **Export**: If you need to store or analyze a cert, the `to_json` and `to_pem` are quite universal.
+
+---
+
+### Summary
+
+- **`SSLCertificate`** is a convenience class for capturing and exporting the **TLS certificate** from your crawled site(s).
+- Common usage is in the **`CrawlResult.ssl_certificate`** field, accessible after setting `fetch_ssl_certificate=True`.
+- Offers quick access to essential certificate details (`issuer`, `subject`, `fingerprint`) and is easy to export (PEM, DER, JSON) for further analysis or server usage.
+
+Use it whenever you need **insight** into a site's certificate or require some form of cryptographic or compliance check.
\ No newline at end of file
diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md
index 509991e5..5972f402 100644
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -1,244 +1,301 @@
-# Complete Parameter Guide for arun()
+# `arun()` Parameter Guide (New Approach)
-The following parameters can be passed to the `arun()` method. They are organized by their primary usage context and functionality.
-
-## Core Parameters
+In Crawl4AI's **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
```python
await crawler.arun(
- url="https://example.com", # Required: URL to crawl
- verbose=True, # Enable detailed logging
- cache_mode=CacheMode.ENABLED, # Control cache behavior
- warmup=True # Whether to run warmup check
+ url="https://example.com",
+ config=my_run_config
)
```
-## Cache Control
+Below is an organized look at the parameters that can go inside `CrawlerRunConfig`, divided by their functional areas. For **Browser** settings (e.g., `headless`, `browser_type`), see [BrowserConfig](./parameters.md).
+
+---
+
+## 1. Core Usage
```python
-from crawl4ai import CacheMode
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-await crawler.arun(
- cache_mode=CacheMode.ENABLED, # Normal caching (read/write)
- # Other cache modes:
- # cache_mode=CacheMode.DISABLED # No caching at all
- # cache_mode=CacheMode.READ_ONLY # Only read from cache
- # cache_mode=CacheMode.WRITE_ONLY # Only write to cache
- # cache_mode=CacheMode.BYPASS # Skip cache for this operation
+async def main():
+ run_config = CrawlerRunConfig(
+ verbose=True, # Detailed logging
+ cache_mode=CacheMode.ENABLED, # Use normal read/write cache
+ # ... other parameters
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=run_config
+ )
+ print(result.cleaned_html[:500])
+
+```
+
+**Key Fields**:
+- `verbose=True` logs each crawl step.
+- `cache_mode` decides how to read/write the local crawl cache.
+
+---
+
+## 2. Cache Control
+
+**`cache_mode`** (default: `CacheMode.ENABLED`)
+Use a built-in enum from `CacheMode`:
+- `ENABLED`: Normal caching—reads if available, writes if missing.
+- `DISABLED`: No caching—always refetch pages.
+- `READ_ONLY`: Reads from cache only; no new writes.
+- `WRITE_ONLY`: Writes to cache but doesn't read existing data.
+- `BYPASS`: Skips reading cache for this crawl (though it might still write if set up that way).
+
+```python
+run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS
)
```
-## Content Processing Parameters
+**Additional flags**:
+- `bypass_cache=True` acts like `CacheMode.BYPASS`.
+- `disable_cache=True` acts like `CacheMode.DISABLED`.
+- `no_cache_read=True` acts like `CacheMode.WRITE_ONLY`.
+- `no_cache_write=True` acts like `CacheMode.READ_ONLY`.
+
+---
+
+## 3. Content Processing & Selection
+
+### 3.1 Text Processing
-### Text Processing
```python
-await crawler.arun(
- word_count_threshold=10, # Minimum words per content block
- image_description_min_word_threshold=5, # Minimum words for image descriptions
- only_text=False, # Extract only text content
- excluded_tags=['form', 'nav'], # HTML tags to exclude
- keep_data_attributes=False, # Preserve data-* attributes
+run_config = CrawlerRunConfig(
+ word_count_threshold=10, # Ignore text blocks <10 words
+ only_text=False, # If True, tries to remove non-text elements
+ keep_data_attributes=False # Keep or discard data-* attributes
)
```
-### Content Selection
+### 3.2 Content Selection
+
```python
-await crawler.arun(
- css_selector=".main-content", # CSS selector for content extraction
- remove_forms=True, # Remove all form elements
- remove_overlay_elements=True, # Remove popups/modals/overlays
+run_config = CrawlerRunConfig(
+ css_selector=".main-content", # Focus on .main-content region only
+ excluded_tags=["form", "nav"], # Remove entire tag blocks
+ remove_forms=True, # Specifically strip
Great article!
+Thanks for sharing.
+